From 421265de87f1398df8c905b35cda45a0b5280112 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Wed, 31 Dec 2025 09:34:04 +0000 Subject: [PATCH 01/13] feat(qualcomm, qwen3 ptq): PTQ py script for quantize qwen3 for qualcomm NPU. --- .../backends/qualcomm/transformers/.gitignore | 1 + .../__init__.py} | 0 .../qualcomm/transformers/core/qdq.py | 53 ++ .../qualcomm/transformers/core/qlinear.py | 187 +++++++ .../qualcomm/transformers/core/rms_norm.py | 72 +++ .../transformers/core/test_qlinear.py | 89 ++++ .../qualcomm/transformers/static_qwen3.py | 458 +++++++++++++++++- 7 files changed, 858 insertions(+), 2 deletions(-) create mode 100644 pymllm/backends/qualcomm/transformers/.gitignore rename pymllm/backends/qualcomm/transformers/{static_qwen3_quantizer.py => core/__init__.py} (100%) create mode 100644 pymllm/backends/qualcomm/transformers/core/qdq.py create mode 100644 pymllm/backends/qualcomm/transformers/core/qlinear.py create mode 100644 pymllm/backends/qualcomm/transformers/core/rms_norm.py create mode 100644 pymllm/backends/qualcomm/transformers/core/test_qlinear.py diff --git a/pymllm/backends/qualcomm/transformers/.gitignore b/pymllm/backends/qualcomm/transformers/.gitignore new file mode 100644 index 000000000..198e9b7c5 --- /dev/null +++ b/pymllm/backends/qualcomm/transformers/.gitignore @@ -0,0 +1 @@ +static_one_more_thing.py diff --git a/pymllm/backends/qualcomm/transformers/static_qwen3_quantizer.py b/pymllm/backends/qualcomm/transformers/core/__init__.py similarity index 100% rename from pymllm/backends/qualcomm/transformers/static_qwen3_quantizer.py rename to pymllm/backends/qualcomm/transformers/core/__init__.py diff --git a/pymllm/backends/qualcomm/transformers/core/qdq.py b/pymllm/backends/qualcomm/transformers/core/qdq.py new file mode 100644 index 000000000..9d087a7b9 --- /dev/null +++ b/pymllm/backends/qualcomm/transformers/core/qdq.py @@ -0,0 +1,53 @@ +import torch +import torch.nn as nn +from torch.ao.quantization import 
FakeQuantize, MinMaxObserver + + +class ActivationQDQInt16PerTensorSym(nn.Module): + def __init__(self): + super().__init__() + self.fake_quant = FakeQuantize( + observer=MinMaxObserver, + quant_min=-32768, + quant_max=32767, + dtype=torch.qint32, + qscheme=torch.per_tensor_symmetric, + ) + self.enable_observer() + + def forward(self, x): + return self.fake_quant(x) + + def enable_observer(self): + self.fake_quant.enable_observer() + + def disable_observer(self): + self.fake_quant.disable_observer() + + +class ActivationQDQInt8PerTensorSym(nn.Module): + def __init__(self): + super().__init__() + self.fake_quant = FakeQuantize( + observer=MinMaxObserver, + quant_min=-128, + quant_max=127, + dtype=torch.qint32, + qscheme=torch.per_tensor_symmetric, + ) + self.enable_observer() + + def forward(self, x): + return self.fake_quant(x) + + def enable_observer(self): + self.fake_quant.enable_observer() + + def disable_observer(self): + self.fake_quant.disable_observer() + + +QDQ_OP = { + "A8-PerTensor": ActivationQDQInt8PerTensorSym, + "A16-PerTensor": ActivationQDQInt16PerTensorSym, +} diff --git a/pymllm/backends/qualcomm/transformers/core/qlinear.py b/pymllm/backends/qualcomm/transformers/core/qlinear.py new file mode 100644 index 000000000..654f05f3d --- /dev/null +++ b/pymllm/backends/qualcomm/transformers/core/qlinear.py @@ -0,0 +1,187 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.ao.quantization import FakeQuantize, MinMaxObserver, PerChannelMinMaxObserver + + +class QLinear(nn.Module): + def __init__(self, in_features, out_features, bias=True): + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.weight = nn.Parameter( + torch.randn(out_features, in_features, dtype=torch.bfloat16) + ) + if bias: + self.bias = nn.Parameter(torch.zeros(out_features, dtype=torch.bfloat16)) + else: + self.register_parameter("bias", None) + + self.act_quant = None + self.weight_quant = None + self.w_q_cache 
= None + + def _setup_status(self, already_quantized_w, already_quantized_a): + if self.act_quant: + if already_quantized_a: + self.act_quant.disable_observer() + else: + self.act_quant.enable_observer() + if self.weight_quant: + if already_quantized_w: + self.weight_quant.disable_observer() + else: + self.weight_quant.enable_observer() + + def _clear_cache(self): + self.w_q_cache = None + + +class QLinearW8A16_PerChannelSym_PerTensorSym(QLinear): + def __init__( + self, + in_features, + out_features, + bias=True, + already_quantized_weight=False, + already_quantized_activation=False, + ): + super().__init__(in_features, out_features, bias) + + self.weight_quant = FakeQuantize( + observer=PerChannelMinMaxObserver, + quant_min=-128, + quant_max=127, + dtype=torch.qint32, + qscheme=torch.per_channel_symmetric, + ch_axis=0, + ) + self._setup_status(already_quantized_weight, already_quantized_activation) + + def forward(self, x): + x_q = x + if self.w_q_cache is not None: + w_q = self.w_q_cache + else: + w_q = self.weight_quant(self.weight) + self.w_q_cache = w_q + return F.linear(x_q, w_q, self.bias) + + +class QLinearLPBQ(QLinear): + def __init__( + self, + in_features, + out_features, + bias=True, + block_size=64, + already_quantized_weight=False, + already_quantized_activation=False, + ): + super().__init__(in_features, out_features, bias) + + self.block_size = block_size + self.already_quantized_w = already_quantized_weight + + # Define buffers to store quantization parameters + # Initially set to None, populated during first forward pass, or saved to state_dict + self.register_buffer("scale_2_fp32", None) # Level 2 Scale (FP32/BF16) + self.register_buffer( + "scale_1_uint4", None + ) # Level 1 Scale Indices (Uint4 stored as Uint8) + self.register_buffer("weight_q", None) # Weight Indices (Int4 stored as Int8) + + self._setup_status(already_quantized_weight, already_quantized_activation) + + def _fake_quant_weight_double(self, w): + """ + Double quantization 
calculation (no STE, forward-only simulation) + And save quantization parameters to Buffer + """ + out_channels, in_channels = w.shape + + # 1. Padding + padding = 0 + if in_channels % self.block_size != 0: + padding = self.block_size - (in_channels % self.block_size) + w = F.pad(w, (0, padding), "constant", 0) + + # Reshape: [Out, Num_Blocks, Block_Size] + w_reshaped = w.view(out_channels, -1, self.block_size) + + # ======================================================= + # Level 1 Scale Calculation (Ideal FP32) + # ======================================================= + w_int4_max = 7.0 + # w_int4_min = -8.0 + + # [Out, Num_Blocks, 1] + w_abs_max = w_reshaped.abs().amax(dim=-1, keepdim=True) + scale_1_fp32 = w_abs_max / w_int4_max + scale_1_fp32 = torch.clamp(scale_1_fp32, min=1e-8) + + # ======================================================= + # Level 2 Scale Calculation & Level 1 Scale Quantization + # ======================================================= + s_uint4_max = 15.0 + s_uint4_min = 0.0 + + # Calculate Level 2 Scale (Per-Channel FP32) -> [Out, 1, 1] + scale_2_fp32 = scale_1_fp32.amax(dim=1, keepdim=True) / s_uint4_max + scale_2_fp32 = torch.clamp(scale_2_fp32, min=1e-8) + + # Quantize Level 1 Scale: FP32 -> Uint4 Indices + scale_1_q = torch.round(scale_1_fp32 / scale_2_fp32) + scale_1_q = torch.clamp(scale_1_q, s_uint4_min, s_uint4_max) + + # Dequantize Level 1 Scale + scale_1_recon = scale_1_q * scale_2_fp32 + + # ======================================================= + # Apply Level 1 Quantization (Quantize Weights) + # ======================================================= + w_int4_min = -8.0 + + # Quantize Weight: FP32 -> Int4 Indices + w_q = torch.round(w_reshaped / scale_1_recon) + w_q = torch.clamp(w_q, w_int4_min, w_int4_max) + + # Dequantize Weight + w_recon = w_q * scale_1_recon + + # ======================================================= + # [NEW] Store Scales and Indices + # ======================================================= + 
# Note: We store Indices here, typically converted to int8/uint8 to save space + # scale_2 itself is a floating-point number, kept as is + self.scale_2_fp32 = scale_2_fp32.detach() + # scale_1_q is 0-15, stored as uint8 + self.scale_1_uint4 = scale_1_q.detach().to(torch.uint8) + # w_q is -8 to 7, stored as int8 + self.weight_q = w_q.detach().to(torch.int8) + + # ======================================================= + # Restore Shape + # ======================================================= + w_out = w_recon.view(out_channels, -1) + if padding > 0: + w_out = w_out[:, :-padding] + + return w_out.to(torch.bfloat16) + + def forward(self, x): + x_q = x + + if self.w_q_cache is not None: + w_q = self.w_q_cache + else: + if self.already_quantized_w: + w_q = self.weight + else: + # Real-time calculation and update of self.scale_2, self.scale_1_idx, self.weight_idx + w_q = self._fake_quant_weight_double(self.weight) + + if self.use_weight_cache: + self.w_q_cache = w_q + + return F.linear(x_q, w_q, self.bias) diff --git a/pymllm/backends/qualcomm/transformers/core/rms_norm.py b/pymllm/backends/qualcomm/transformers/core/rms_norm.py new file mode 100644 index 000000000..eb9ec8d88 --- /dev/null +++ b/pymllm/backends/qualcomm/transformers/core/rms_norm.py @@ -0,0 +1,72 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.ao.quantization import FakeQuantize, MinMaxObserver + + +class QRMSNorm(nn.Module): + """ + RMSNorm with int16 per-tensor symmetric quantized weight. + + This implementation applies quantization to the weight tensor only, + using per-tensor symmetric quantization with int16 range. 
+ """ + + def __init__( + self, + normalized_shape, + eps=1e-6, + elementwise_affine=True, + already_quantized_weight=False, + ): + super().__init__() + + if isinstance(normalized_shape, int): + normalized_shape = (normalized_shape,) + self.normalized_shape = tuple(normalized_shape) + self.eps = eps + self.already_quantized_w = already_quantized_weight + + if elementwise_affine: + self.weight = nn.Parameter( + torch.ones(normalized_shape, dtype=torch.bfloat16) + ) + else: + self.register_parameter("weight", None) + + # Weight quantization for int16 per-tensor symmetric + self.weight_quant = FakeQuantize( + observer=MinMaxObserver, + quant_min=-32768, + quant_max=32767, + dtype=torch.qint32, + qscheme=torch.per_tensor_symmetric, + ) + + self.w_q_cache = None + self.use_weight_cache = already_quantized_weight + + def _clear_cache(self): + self.w_q_cache = None + + def forward(self, x): + # Compute RMS norm + variance = x.to(torch.float32).pow(2).mean(-1, keepdim=True) + x = x * torch.rsqrt(variance + self.eps) + + # Apply quantized weight + if self.weight is not None: + if self.w_q_cache is not None: + w_q = self.w_q_cache + else: + if self.already_quantized_w: + w_q = self.weight + else: + w_q = self.weight_quant(self.weight) + + if self.use_weight_cache: + self.w_q_cache = w_q + + x = x * w_q + + return x diff --git a/pymllm/backends/qualcomm/transformers/core/test_qlinear.py b/pymllm/backends/qualcomm/transformers/core/test_qlinear.py new file mode 100644 index 000000000..69edd69f6 --- /dev/null +++ b/pymllm/backends/qualcomm/transformers/core/test_qlinear.py @@ -0,0 +1,89 @@ +import torch +import torch.nn as nn +from pymllm.backends.qualcomm.transformers.core.qlinear import QLinearLPBQ + + +def test_qlinear_lpbq(): + """ + Test QLinearLPBQ implementation against bf16 baseline. + + This test verifies that the double quantization implementation + produces results close to the bf16 baseline when using appropriate + quantization parameters. 
+ """ + # Set random seed for reproducibility + torch.manual_seed(42) + + # Test parameters + in_features = 256 + out_features = 128 + batch_size = 4 + seq_len = 16 + block_size = 64 + + # Create input tensor (bf16 baseline) + x_bf16 = torch.randn(batch_size, seq_len, in_features, dtype=torch.bfloat16) + + # Create reference linear layer (bf16) + linear_bf16 = nn.Linear(in_features, out_features, bias=True, dtype=torch.bfloat16) + # Copy weights and bias to ensure same values + with torch.no_grad(): + linear_bf16.weight.copy_( + torch.randn(out_features, in_features, dtype=torch.bfloat16) + ) + linear_bf16.bias.copy_(torch.zeros(out_features, dtype=torch.bfloat16)) + + # Get bf16 reference output + with torch.no_grad(): + output_bf16 = linear_bf16(x_bf16) + + # Create QLinearLPBQ with same weights + qlinear = QLinearLPBQ( + in_features=in_features, + out_features=out_features, + bias=True, + block_size=block_size, + already_quantized_weight=False, + already_quantized_activation=False, + ) + + # Copy the same weights and bias + with torch.no_grad(): + qlinear.weight.copy_(linear_bf16.weight.data) + if qlinear.bias is not None: + qlinear.bias.copy_(linear_bf16.bias.data) + + # Get quantized output + with torch.no_grad(): + output_q = qlinear(x_bf16) + output_q_bf16 = output_q + + # Calculate metrics + mse = torch.mean((output_bf16 - output_q_bf16) ** 2) + mae = torch.mean(torch.abs(output_bf16 - output_q_bf16)) + + # Calculate relative error + relative_error = torch.mean( + torch.abs(output_bf16 - output_q_bf16) / (torch.abs(output_bf16) + 1e-8) + ) + + # Print results + print("=== QLinearLPBQ Test Results ===") + print(f"Input shape: {x_bf16.shape}") + print(f"Output shape: {output_bf16.shape}") + print(f"Block size: {block_size}") + print("\nComparison with bf16 baseline:") + print(f"MSE: {mse:.6e}") + print(f"MAE: {mae:.6e}") + print(f"Relative Error: {relative_error:.6e}") + + # Check if results are within acceptable tolerance + # For double quantization, we 
expect some error but should be reasonable + tolerance = 0.1 # 10% relative error tolerance + + if relative_error < tolerance: + print(f"\n✓ TEST PASSED: Relative error {relative_error:.6e} < {tolerance}") + return True + else: + print(f"\n✗ TEST FAILED: Relative error {relative_error:.6e} >= {tolerance}") + return False diff --git a/pymllm/backends/qualcomm/transformers/static_qwen3.py b/pymllm/backends/qualcomm/transformers/static_qwen3.py index 64a1c25b7..1f341648b 100644 --- a/pymllm/backends/qualcomm/transformers/static_qwen3.py +++ b/pymllm/backends/qualcomm/transformers/static_qwen3.py @@ -1,3 +1,457 @@ import torch -import torchao -from torchao.quantization.pt2e.quantize_pt2e import prepare_pt2e +from torch import nn +from torch.nn import functional as F +from pymllm.backends.qualcomm.transformers.core.qdq import QDQ_OP +from pymllm.backends.qualcomm.transformers.core.rms_norm import QRMSNorm +from pymllm.backends.qualcomm.transformers.core.qlinear import ( + QLinearLPBQ, + QLinearW8A16_PerChannelSym_PerTensorSym, +) + + +class Qwen3Config: + def __init__(self): + self.attention_bias = False + self.attention_dropout = 0.0 + self.bos_token_id = 151643 + self.eos_token_id = 151645 + self.head_dim = 128 + self.hidden_act = "silu" + self.hidden_size = 2048 + self.initializer_range = 0.02 + self.intermediate_size = 6144 + self.max_position_embeddings = 40960 + self.max_window_layers = 28 + self.model_type = "qwen3" + self.num_attention_heads = 16 + self.num_hidden_layers = 28 + self.num_key_value_heads = 8 + self.pad_token_id = 151643 + self.rms_norm_eps = 1e-06 + self.rope_scaling = None + self.rope_theta = 1000000 + self.sliding_window = None + self.tie_word_embeddings = True + self.torch_dtype = "bfloat16" + self.transformers_version = "4.51.0" + self.use_cache = True + self.use_sliding_window = False + self.vocab_size = 151936 + + +def generate_rope_cache( + max_length: int, + head_dim: int, + rope_theta: float, + dtype=torch.bfloat16, + device="cpu", +): + 
""" + Generate RoPE (Rotary Position Embedding) cache for given max_length. + + Args: + max_length: Maximum sequence length + head_dim: Dimension of each attention head + rope_theta: RoPE theta parameter (frequency base) + dtype: Data type for the embeddings + device: Device to place the embeddings on + + Returns: + tuple: (cos, sin) embeddings of shape [max_length, head_dim] + """ + inv_freq = 1.0 / ( + rope_theta + ** (torch.arange(0, head_dim, 2, dtype=torch.float32, device=device) / head_dim) + ) + t = torch.arange(max_length, dtype=torch.float32, device=device) + freqs = torch.einsum("i,j->ij", t, inv_freq) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos().to(dtype) + sin = emb.sin().to(dtype) + return cos, sin + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand( + batch, num_key_value_heads, n_rep, slen, head_dim + ) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. 
+ unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class Qwen3MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = QLinearLPBQ( + self.hidden_size, + self.intermediate_size, + bias=False, + block_size=32, + ) + self.up_proj = QLinearLPBQ( + self.hidden_size, + self.intermediate_size, + bias=False, + block_size=32, + ) + self.down_proj = QLinearLPBQ( + self.intermediate_size, + self.hidden_size, + bias=False, + block_size=32, + ) + self.act_fn = nn.SiLU() + + # QDQ + self.qdq_x = QDQ_OP["A16-PerTensor"]() + self.qdq_up_result = QDQ_OP["A16-PerTensor"]() + self.qdq_gate_result = QDQ_OP["A16-PerTensor"]() + self.qdq_act = QDQ_OP["A16-PerTensor"]() + self.qdq_middle = QDQ_OP["A16-PerTensor"]() + + def forward(self, x): + """ + input: + x: bf16, w/o fakequant + output: + o: bf16, w/o fakequant + """ + x = self.qdq_x(x) + up_result = self.qdq_up_result(self.up_proj(x)) + gate_result = 
self.qdq_gate_result(self.gate_proj(x)) + up_result = self.qdq_act(self.act_fn(up_result)) + o = self.qdq_middle(gate_result * up_result) + o = self.down_proj(o) + return o + + +class Qwen3Attention(nn.Module): + def __init__(self, config, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr( + config, "head_dim", config.hidden_size // config.num_attention_heads + ) + self.num_key_value_groups = ( + config.num_attention_heads // config.num_key_value_heads + ) + self.scaling = self.head_dim**-0.5 + self.q_proj = QLinearLPBQ( + config.hidden_size, + config.num_attention_heads * self.head_dim, + bias=False, + block_size=32, + ) + self.k_proj = QLinearLPBQ( + config.hidden_size, + config.num_key_value_heads * self.head_dim, + bias=False, + block_size=32, + ) + self.v_proj = QLinearLPBQ( + config.hidden_size, + config.num_key_value_heads * self.head_dim, + bias=False, + block_size=32, + ) + self.o_proj = QLinearLPBQ( + config.num_attention_heads * self.head_dim, + config.hidden_size, + bias=False, + block_size=32, + ) + self.q_norm = QRMSNorm(self.head_dim, eps=config.rms_norm_eps) + self.k_norm = QRMSNorm(self.head_dim, eps=config.rms_norm_eps) + + # QDQ + self.qdq_hidden_states = QDQ_OP["A16-PerTensor"]() + self.qdq_0 = QDQ_OP["A16-PerTensor"]() + self.qdq_1 = QDQ_OP["A16-PerTensor"]() + self.qdq_2 = QDQ_OP["A16-PerTensor"]() + self.qdq_3 = QDQ_OP["A16-PerTensor"]() + self.qdq_4 = QDQ_OP["A8-PerTensor"]() + self.qdq_5 = QDQ_OP["A16-PerTensor"]() + self.qdq_6 = QDQ_OP["A16-PerTensor"]() + self.qdq_7 = QDQ_OP["A16-PerTensor"]() + self.qdq_8 = QDQ_OP["A16-PerTensor"]() + self.qdq_9 = QDQ_OP["A16-PerTensor"]() + self.qdq_10 = QDQ_OP["A16-PerTensor"]() + self.qdq_11 = QDQ_OP["A16-PerTensor"]() + self.qdq_12 = QDQ_OP["A16-PerTensor"]() + self.qdq_13 = QDQ_OP["A16-PerTensor"]() + self.qdq_14 = QDQ_OP["A8-PerTensor"]() + + self.qdq_rope_0 = QDQ_OP["A16-PerTensor"]() + self.qdq_rope_1 = QDQ_OP["A16-PerTensor"]() + 
self.qdq_rope_2 = QDQ_OP["A16-PerTensor"]() + self.qdq_rope_3 = QDQ_OP["A16-PerTensor"]() + self.qdq_rope_4 = QDQ_OP["A16-PerTensor"]() + self.qdq_rope_5 = QDQ_OP["A16-PerTensor"]() + + def forward( + self, + hidden_states: torch.Tensor, + sin: torch.Tensor, + cos: torch.Tensor, + causal_mask: torch.Tensor, + ): + """ + input: + hidden_states: bf16, w/o fakequant + output: + o: bf16, w/o fakequant + """ + bsz, seq_len, _ = hidden_states.shape + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + quantized_hidden_states = self.qdq_hidden_states(hidden_states) + + # [B, H, S, D] + query_states = ( + self.q_proj(quantized_hidden_states).view(hidden_shape).transpose(1, 2) + ) + key_states = ( + self.k_proj(quantized_hidden_states).view(hidden_shape).transpose(1, 2) + ) + value_states = ( + self.v_proj(quantized_hidden_states).view(hidden_shape).transpose(1, 2) + ) + + query_states = self.q_norm(self.qdq_0(query_states)) + query_states = self.qdq_1(query_states) + + key_states = self.k_norm(self.qdq_2(key_states)) + key_states = self.qdq_3(key_states) + + # ROPE Here + # cos = cos.unsqueeze(unsqueeze_dim) + # sin = sin.unsqueeze(unsqueeze_dim) + # q_embed = (q * cos) + (rotate_half(q) * sin) + # k_embed = (k * cos) + (rotate_half(k) * sin) + cos_embedding = cos.unsqueeze(1) + sin_embedding = sin.unsqueeze(1) + rot_q = rotate_half(query_states) + rot_k = rotate_half(key_states) + query_states = self.qdq_rope_0( + self.qdq_rope_1(query_states * cos_embedding) + + self.qdq_rope_2(rot_q * sin_embedding) + ) + key_states = self.qdq_rope_3( + self.qdq_rope_4(key_states * cos_embedding) + + self.qdq_rope_5(rot_k * sin_embedding) + ) + + key_states = self.qdq_4(key_states) + key_states = key_states.transpose(2, 3) # [B, H, D, S] + key_states = repeat_kv(key_states, self.num_key_value_groups) + + attn = query_states @ key_states + attn = self.qdq_5(attn) + attn = attn / self.qdq_6(torch.ones(1, dtype=torch.bfloat16) * self.scaling) + attn 
= self.qdq_7(attn) + attn_min = torch.amin(attn, dim=-1, keepdim=True) + attn_min = self.qdq_8(attn_min) + attn_vv = attn_min - 20 + attn_vv = self.qdq_9(attn_vv) + attn = torch.where(causal_mask == 0, attn, attn_vv) + attn = self.qdq_10(attn) + attn = F.softmax(attn, -1) + attn = self.qdq_11(attn) + y = attn @ self.qdq_14(self.qdq_13(value_states)) + y = self.qdq_12(y) + y = y.transpose(1, 2).reshape(bsz, seq_len, -1) + y = self.o_proj(y) + return y + + +class Qwen3DecodeLayer(nn.Module): + def __init__(self, config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = Qwen3Attention(config=config, layer_idx=layer_idx) + self.mlp = Qwen3MLP(config) + self.input_layernorm = QRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = QRMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + + self.qdq_0 = QDQ_OP["A16-PerTensor"]() + self.qdq_1 = QDQ_OP["A16-PerTensor"]() + self.qdq_2 = QDQ_OP["A16-PerTensor"]() + self.qdq_3 = QDQ_OP["A16-PerTensor"]() + + def forward( + self, + hidden_states: torch.Tensor, + sin: torch.Tensor, + cos: torch.Tensor, + causal_mask: torch.Tensor, + ): + """ + inputs: + hidden_states: bf16, w/o fakequant + outputs: + hidden_states: bf16, w/o fakequant + """ + hidden_states = self.qdq_0(hidden_states) + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + # Self Attention + hidden_states = self.self_attn( + hidden_states, + sin, + cos, + causal_mask, + ) + hidden_states = self.qdq_2(residual + self.qdq_1(hidden_states)) + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + self.qdq_3(hidden_states) + return hidden_states + + +class Qwen3Model(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + 
self.embed_tokens = nn.Embedding( + config.vocab_size, config.hidden_size, self.padding_idx + ) + self.layers = nn.ModuleList( + [ + Qwen3DecodeLayer(config, layer_idx) + for layer_idx in range(config.num_hidden_layers) + ] + ) + self.norm = QRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.qdq_0 = QDQ_OP["A16-PerTensor"]() + + def forward(self, input_ids, sin, cos, causal_mask): + inputs_embeds = self.embed_tokens(input_ids) + hidden_states = inputs_embeds + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + hidden_states = decoder_layer(hidden_states, sin, cos, causal_mask) + + hidden_states = self.norm(self.qdq_0(hidden_states)) + return hidden_states + + +class Qwen3ForCausalLM: + def __init__(self, config): + self.config = config + self.model = Qwen3Model(config) + self.vocab_size = config.vocab_size + self.lm_head = QLinearW8A16_PerChannelSym_PerTensorSym( + config.hidden_size, config.vocab_size, bias=False + ) + self.qdq_0 = QDQ_OP["A16-PerTensor"]() + self.qdq_1 = QDQ_OP["A16-PerTensor"]() + self.qdq_2 = QDQ_OP["A16-PerTensor"]() + + # Register sin and cos as buffers + self.register_buffer("sin", None) + self.register_buffer("cos", None) + + self.k_cache = None + self.v_cache = None + + def forward( + self, + input_ids, + position_ids, + max_length, + ): + bsz, seq_len = input_ids.shape + + # Generate causal mask based on position_ids length + # For prefill, we need a lower triangular mask + causal_mask = 1 - torch.tril( + torch.ones(seq_len, seq_len, dtype=torch.int8, device=input_ids.device) + ) + causal_mask = causal_mask.unsqueeze(0).unsqueeze(0) # [1, 1, seq_len, seq_len] + + # Generate or use registered RoPE embeddings + if self.sin is None or self.cos is None or self.cos.shape[0] < max_length: + cos, sin = generate_rope_cache( + max_length, + head_dim=self.config.head_dim, + rope_theta=self.config.rope_theta, + dtype=torch.bfloat16, + device=input_ids.device, + ) + # Register the generated embeddings + self.sin = 
self.qdq_1(sin) + self.cos = self.qdq_2(cos) + + if self.k_cache is None or self.v_cache is None: + pass + + # Slice RoPE embeddings to current sequence length + cos = self.cos[position_ids] + sin = self.sin[position_ids] + + out = self.model(input_ids, sin, cos, causal_mask) + logits = self.lm_head(self.qdq_0(out)) + return logits + + def _update_kv_cache_by_copy(self): + pass + + def _freeze_observer(self): + pass + + def infer(self, model_path: str, prompt: str, max_length) -> str: + pass + + def calibrate(self, model_path: str, dataset_path: str): + pass From a18fe97d7ef2a8baadf6afc9a92505efad1c7e15 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Wed, 31 Dec 2025 09:38:39 +0000 Subject: [PATCH 02/13] fix: add som comments and fix causal mask decoding bug. --- .../qualcomm/transformers/static_qwen3.py | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/pymllm/backends/qualcomm/transformers/static_qwen3.py b/pymllm/backends/qualcomm/transformers/static_qwen3.py index 1f341648b..d7a6ffb13 100644 --- a/pymllm/backends/qualcomm/transformers/static_qwen3.py +++ b/pymllm/backends/qualcomm/transformers/static_qwen3.py @@ -9,6 +9,7 @@ ) +# This settings below is for Qwen1.7B class Qwen3Config: def __init__(self): self.attention_bias = False @@ -411,14 +412,21 @@ def forward( position_ids, max_length, ): - bsz, seq_len = input_ids.shape + _, seq_len = input_ids.shape # Generate causal mask based on position_ids length # For prefill, we need a lower triangular mask - causal_mask = 1 - torch.tril( - torch.ones(seq_len, seq_len, dtype=torch.int8, device=input_ids.device) - ) - causal_mask = causal_mask.unsqueeze(0).unsqueeze(0) # [1, 1, seq_len, seq_len] + if seq_len != 1: + causal_mask = 1 - torch.tril( + torch.ones(seq_len, seq_len, dtype=torch.int8, device=input_ids.device) + ) + # [1, 1, seq_len, seq_len] + causal_mask = causal_mask.unsqueeze(0).unsqueeze(0) + else: + # [1, 1, seq_len, seq_len] + causal_mask = 
torch.zeros( + (1, 1, 1, seq_len), dtype=torch.int8, device=input_ids.device + ) # Generate or use registered RoPE embeddings if self.sin is None or self.cos is None or self.cos.shape[0] < max_length: @@ -454,4 +462,7 @@ def infer(self, model_path: str, prompt: str, max_length) -> str: pass def calibrate(self, model_path: str, dataset_path: str): + """ + calibrate Only on PREFILL stage !!! + """ pass From ca558923f4d5686d37ecfd1104cc0ad70ad92713 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Wed, 31 Dec 2025 09:54:31 +0000 Subject: [PATCH 03/13] feat: add kvcache in attention --- .../qualcomm/transformers/static_qwen3.py | 31 ++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/pymllm/backends/qualcomm/transformers/static_qwen3.py b/pymllm/backends/qualcomm/transformers/static_qwen3.py index d7a6ffb13..becbe9f48 100644 --- a/pymllm/backends/qualcomm/transformers/static_qwen3.py +++ b/pymllm/backends/qualcomm/transformers/static_qwen3.py @@ -233,6 +233,9 @@ def __init__(self, config, layer_idx: int): self.qdq_rope_4 = QDQ_OP["A16-PerTensor"]() self.qdq_rope_5 = QDQ_OP["A16-PerTensor"]() + self.k_cache = None + self.v_cache = None + def forward( self, hidden_states: torch.Tensor, @@ -287,8 +290,25 @@ def forward( ) key_states = self.qdq_4(key_states) - key_states = key_states.transpose(2, 3) # [B, H, D, S] + # [B, H, D, S] + key_states = key_states.transpose(2, 3) + # [B, H, S, D] + value_states = self.qdq_14(self.qdq_13(value_states)) + + # KV Cache Here + if seq_len > 1 and self.k_cache is not None and self.v_cache is not None: + self.k_cache = None + self.v_cache = None + + if seq_len == 1: + self.k_cache = torch.cat([self.k_cache, key_states], dim=-1) + self.v_cache = torch.cat([self.v_cache, value_states], dim=2) + else: + self.k_cache = key_states + self.v_cache = value_states + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) 
attn = query_states @ key_states attn = self.qdq_5(attn) @@ -302,7 +322,7 @@ def forward( attn = self.qdq_10(attn) attn = F.softmax(attn, -1) attn = self.qdq_11(attn) - y = attn @ self.qdq_14(self.qdq_13(value_states)) + y = attn @ value_states y = self.qdq_12(y) y = y.transpose(1, 2).reshape(bsz, seq_len, -1) y = self.o_proj(y) @@ -403,9 +423,6 @@ def __init__(self, config): self.register_buffer("sin", None) self.register_buffer("cos", None) - self.k_cache = None - self.v_cache = None - def forward( self, input_ids, @@ -441,9 +458,6 @@ def forward( self.sin = self.qdq_1(sin) self.cos = self.qdq_2(cos) - if self.k_cache is None or self.v_cache is None: - pass - # Slice RoPE embeddings to current sequence length cos = self.cos[position_ids] sin = self.sin[position_ids] @@ -465,4 +479,5 @@ def calibrate(self, model_path: str, dataset_path: str): """ calibrate Only on PREFILL stage !!! """ + # Call infer after calibrate done. pass From dd58482dd5ad5b93cd48b2df02cb96f3a8fa1502 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Fri, 2 Jan 2026 15:58:28 +0000 Subject: [PATCH 04/13] feat: Qualcomm Calibrate Things --- mllm/backends/qnn/aot/QnnWrappersAPI.cpp | 1 + .../qualcomm/transformers/core/qdq.py | 59 +- .../qualcomm/transformers/core/qlinear.py | 257 +++---- .../qualcomm/transformers/core/rms_norm.py | 89 ++- .../transformers/qwen3/modeling_qwen3.py | 687 ++++++++++++++++++ .../qualcomm/transformers/qwen3/runner.py | 155 ++++ .../qualcomm/transformers/static_qwen3.py | 240 ++++-- .../backends/qualcomm/transformers/train.py | 6 + requirements-qnn-aot.txt | 1 + 9 files changed, 1224 insertions(+), 271 deletions(-) create mode 100644 pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py create mode 100644 pymllm/backends/qualcomm/transformers/qwen3/runner.py create mode 100644 requirements-qnn-aot.txt diff --git a/mllm/backends/qnn/aot/QnnWrappersAPI.cpp b/mllm/backends/qnn/aot/QnnWrappersAPI.cpp index 0fd354de3..0f29498f5 100644 --- 
a/mllm/backends/qnn/aot/QnnWrappersAPI.cpp +++ b/mllm/backends/qnn/aot/QnnWrappersAPI.cpp @@ -865,6 +865,7 @@ QnnAOTNodeTensor::ptr_t QnnAOTEnv::captureQnnAOTNodeTensor(const std::string& qn auto ret = QnnAOTNodeTensor::create(v, __qnn_enable_static_weight); if (__qnn_enable_static_weight) { contexts_[qnn_context_name]->static_tensor_.insert({__qnn_tensor_name, ret}); + // FIXME, That may be error. qnn_htp_func_symbols_.qnn_interface_.tensorCreateContextTensor(contexts_[qnn_context_name]->qnn_ctx_handle_, ret->getQnnTensor()); } else { diff --git a/pymllm/backends/qualcomm/transformers/core/qdq.py b/pymllm/backends/qualcomm/transformers/core/qdq.py index 9d087a7b9..c7bc351de 100644 --- a/pymllm/backends/qualcomm/transformers/core/qdq.py +++ b/pymllm/backends/qualcomm/transformers/core/qdq.py @@ -3,41 +3,35 @@ from torch.ao.quantization import FakeQuantize, MinMaxObserver -class ActivationQDQInt16PerTensorSym(nn.Module): - def __init__(self): - super().__init__() - self.fake_quant = FakeQuantize( - observer=MinMaxObserver, - quant_min=-32768, - quant_max=32767, - dtype=torch.qint32, - qscheme=torch.per_tensor_symmetric, - ) - self.enable_observer() +class ActivationQDQ(nn.Module): + """ + General activation value pseudo-quantization module (QDQ). + Supports symmetric Per-Tensor quantization, configurable bit numbers (e.g., 8-bit or 16-bit). + """ - def forward(self, x): - return self.fake_quant(x) - - def enable_observer(self): - self.fake_quant.enable_observer() - - def disable_observer(self): - self.fake_quant.disable_observer() + def __init__(self, bits=8, qscheme=torch.per_tensor_symmetric): + super().__init__() + # 1. Calculate quantization range based on bits + # int8: -128 to 127 + # int16: -32768 to 32767 + self.quant_min = -(2 ** (bits - 1)) + self.quant_max = 2 ** (bits - 1) - 1 -class ActivationQDQInt8PerTensorSym(nn.Module): - def __init__(self): - super().__init__() + # 2. 
Initialize FakeQuantize + # For activations, typically use MinMaxObserver or MovingAverageMinMaxObserver self.fake_quant = FakeQuantize( - observer=MinMaxObserver, - quant_min=-128, - quant_max=127, + observer=MinMaxObserver.with_args(qscheme=qscheme, dtype=torch.qint32), + quant_min=self.quant_min, + quant_max=self.quant_max, dtype=torch.qint32, - qscheme=torch.per_tensor_symmetric, + qscheme=qscheme, ) - self.enable_observer() def forward(self, x): + # Directly apply pseudo-quantization. + # When observer is enabled, it continuously updates scale/zp; + # When fakequant is enabled, it simulates quantization errors. return self.fake_quant(x) def enable_observer(self): @@ -46,8 +40,11 @@ def enable_observer(self): def disable_observer(self): self.fake_quant.disable_observer() + def enable_fakequant(self): + self.fake_quant.enable_fake_quant() + + def disable_fakequant(self): + self.fake_quant.disable_fake_quant() -QDQ_OP = { - "A8-PerTensor": ActivationQDQInt8PerTensorSym, - "A16-PerTensor": ActivationQDQInt16PerTensorSym, -} + def extra_repr(self): + return f"bits={self.quant_max.bit_length() + 1}, q_range=({self.quant_min}, {self.quant_max})" diff --git a/pymllm/backends/qualcomm/transformers/core/qlinear.py b/pymllm/backends/qualcomm/transformers/core/qlinear.py index 654f05f3d..bbfcc60df 100644 --- a/pymllm/backends/qualcomm/transformers/core/qlinear.py +++ b/pymllm/backends/qualcomm/transformers/core/qlinear.py @@ -1,7 +1,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -from torch.ao.quantization import FakeQuantize, MinMaxObserver, PerChannelMinMaxObserver +from torch.ao.quantization import FakeQuantize, PerChannelMinMaxObserver class QLinear(nn.Module): @@ -9,179 +9,144 @@ def __init__(self, in_features, out_features, bias=True): super().__init__() self.in_features = in_features self.out_features = out_features - self.weight = nn.Parameter( - torch.randn(out_features, in_features, dtype=torch.bfloat16) - ) + self.weight = 
nn.Parameter(torch.randn(out_features, in_features)) if bias: - self.bias = nn.Parameter(torch.zeros(out_features, dtype=torch.bfloat16)) + self.bias = nn.Parameter(torch.zeros(out_features)) else: self.register_parameter("bias", None) self.act_quant = None self.weight_quant = None - self.w_q_cache = None - - def _setup_status(self, already_quantized_w, already_quantized_a): - if self.act_quant: - if already_quantized_a: - self.act_quant.disable_observer() - else: - self.act_quant.enable_observer() - if self.weight_quant: - if already_quantized_w: + + def freeze_weight(self): + """PTQ Core: Observe current weights, calculate and fix Scale/ZP""" + if self.weight_quant is not None: + # Compatible with official FakeQuantize module + if ( + isinstance(self.weight_quant, FakeQuantize) + and self.weight_quant is not None + ): + _ = self.weight_quant(self.weight) self.weight_quant.disable_observer() - else: - self.weight_quant.enable_observer() - - def _clear_cache(self): - self.w_q_cache = None - - -class QLinearW8A16_PerChannelSym_PerTensorSym(QLinear): - def __init__( - self, - in_features, - out_features, - bias=True, - already_quantized_weight=False, - already_quantized_activation=False, - ): + s = self.weight_quant.scale + print( + f"[{self.__class__.__name__}] Scale Shape: {list(s.shape)}, " + f"scale[:3]: {s.flatten()[:3].tolist()}" + ) + # Compatible with custom LPBQ logic + elif hasattr(self.weight_quant, "freeze"): + self.weight_quant.freeze(self.weight.detach()) + s = self.weight_quant.scale_2_fp32 + if s is not None: + print( + f"[{self.__class__.__name__}] LPBQ L2 Scale Shape: {list(s.shape)}, " + f"scale[:3]: {s.flatten()[:3].tolist()}" + ) + + def forward(self, x): + raise NotImplementedError + + +# --- 1. 
W8A16 Per-Channel Scheme --- +class QLinearW8A16_PerChannelSym(QLinear): + def __init__(self, in_features, out_features, bias=True): super().__init__(in_features, out_features, bias) + # Weight: Int8 Per-Channel symmetric self.weight_quant = FakeQuantize( - observer=PerChannelMinMaxObserver, + observer=PerChannelMinMaxObserver.with_args( + qscheme=torch.per_channel_symmetric, + dtype=torch.qint8, + ch_axis=0, # Quantize output channels + ), quant_min=-128, quant_max=127, - dtype=torch.qint32, + dtype=torch.qint8, qscheme=torch.per_channel_symmetric, - ch_axis=0, ) - self._setup_status(already_quantized_weight, already_quantized_activation) def forward(self, x): + # Activation quantization logic (add act_quant here if needed) x_q = x - if self.w_q_cache is not None: - w_q = self.w_q_cache - else: - w_q = self.weight_quant(self.weight) - self.w_q_cache = w_q + # Apply fake quantization: use fixed scale if frozen, otherwise update in real-time + w_q = self.weight_quant(self.weight) return F.linear(x_q, w_q, self.bias) -class QLinearLPBQ(QLinear): - def __init__( - self, - in_features, - out_features, - bias=True, - block_size=64, - already_quantized_weight=False, - already_quantized_activation=False, - ): - super().__init__(in_features, out_features, bias) +# --- 2. 
LPBQ (Double Quantization) Scheme --- +class DoubleQuantizer(nn.Module): + """ + Handles LPBQ double normalization logic to work like FakeQuantize + """ + def __init__(self, block_size=64): + super().__init__() self.block_size = block_size - self.already_quantized_w = already_quantized_weight - - # Define buffers to store quantization parameters - # Initially set to None, populated during first forward pass, or saved to state_dict - self.register_buffer("scale_2_fp32", None) # Level 2 Scale (FP32/BF16) - self.register_buffer( - "scale_1_uint4", None - ) # Level 1 Scale Indices (Uint4 stored as Uint8) - self.register_buffer("weight_q", None) # Weight Indices (Int4 stored as Int8) - - self._setup_status(already_quantized_weight, already_quantized_activation) - - def _fake_quant_weight_double(self, w): - """ - Double quantization calculation (no STE, forward-only simulation) - And save quantization parameters to Buffer - """ + self.register_buffer("is_frozen", torch.tensor(False)) + self.register_buffer("scale_2_fp32", None) + self.register_buffer("scale_1_uint4", None) + self.register_buffer("weight_q", None) + self.w_recon_cached = None # Cache dequantized weights for acceleration + + def freeze(self, w): + # Run complete double quantization and store in buffer + self.w_recon_cached = self.quantize_dequantize(w, save_buffers=True) + self.is_frozen = torch.tensor(True) + + def quantize_dequantize(self, w, save_buffers=False): out_channels, in_channels = w.shape + # 1. Padding handling + pad_len = (self.block_size - in_channels % self.block_size) % self.block_size + if pad_len > 0: + w = F.pad(w, (0, pad_len), "constant", 0) - # 1. 
Padding - padding = 0 - if in_channels % self.block_size != 0: - padding = self.block_size - (in_channels % self.block_size) - w = F.pad(w, (0, padding), "constant", 0) - - # Reshape: [Out, Num_Blocks, Block_Size] w_reshaped = w.view(out_channels, -1, self.block_size) - # ======================================================= - # Level 1 Scale Calculation (Ideal FP32) - # ======================================================= - w_int4_max = 7.0 - # w_int4_min = -8.0 - - # [Out, Num_Blocks, 1] - w_abs_max = w_reshaped.abs().amax(dim=-1, keepdim=True) - scale_1_fp32 = w_abs_max / w_int4_max - scale_1_fp32 = torch.clamp(scale_1_fp32, min=1e-8) - - # ======================================================= - # Level 2 Scale Calculation & Level 1 Scale Quantization - # ======================================================= - s_uint4_max = 15.0 - s_uint4_min = 0.0 - - # Calculate Level 2 Scale (Per-Channel FP32) -> [Out, 1, 1] - scale_2_fp32 = scale_1_fp32.amax(dim=1, keepdim=True) / s_uint4_max - scale_2_fp32 = torch.clamp(scale_2_fp32, min=1e-8) - - # Quantize Level 1 Scale: FP32 -> Uint4 Indices - scale_1_q = torch.round(scale_1_fp32 / scale_2_fp32) - scale_1_q = torch.clamp(scale_1_q, s_uint4_min, s_uint4_max) - - # Dequantize Level 1 Scale - scale_1_recon = scale_1_q * scale_2_fp32 - - # ======================================================= - # Apply Level 1 Quantization (Quantize Weights) - # ======================================================= - w_int4_min = -8.0 - - # Quantize Weight: FP32 -> Int4 Indices - w_q = torch.round(w_reshaped / scale_1_recon) - w_q = torch.clamp(w_q, w_int4_min, w_int4_max) - - # Dequantize Weight - w_recon = w_q * scale_1_recon - - # ======================================================= - # [NEW] Store Scales and Indices - # ======================================================= - # Note: We store Indices here, typically converted to int8/uint8 to save space - # scale_2 itself is a floating-point number, kept as is - 
self.scale_2_fp32 = scale_2_fp32.detach() - # scale_1_q is 0-15, stored as uint8 - self.scale_1_uint4 = scale_1_q.detach().to(torch.uint8) - # w_q is -8 to 7, stored as int8 - self.weight_q = w_q.detach().to(torch.int8) - - # ======================================================= - # Restore Shape - # ======================================================= - w_out = w_recon.view(out_channels, -1) - if padding > 0: - w_out = w_out[:, :-padding] + # Level 1: FP32 Scale + s1 = w_reshaped.abs().amax(dim=-1, keepdim=True) / 7.0 + s1 = s1.clamp(min=1e-8) - return w_out.to(torch.bfloat16) + # Level 2: Quantize S1 to Uint4 + s2 = s1.amax(dim=1, keepdim=True) / 15.0 + s2 = s2.clamp(min=1e-8) + s1_q = (s1 / s2).round().clamp(0, 15) + s1_recon = s1_q * s2 - def forward(self, x): - x_q = x + # Level 3: Quantize Weight to Int4 + w_q = (w_reshaped / s1_recon).round().clamp(-8, 7) + w_recon = w_q * s1_recon - if self.w_q_cache is not None: - w_q = self.w_q_cache - else: - if self.already_quantized_w: - w_q = self.weight - else: - # Real-time calculation and update of self.scale_2, self.scale_1_idx, self.weight_idx - w_q = self._fake_quant_weight_double(self.weight) + if save_buffers: + self.scale_2_fp32 = s2.detach() + self.scale_1_uint4 = s1_q.detach().to(torch.uint8) + self.weight_q = w_q.detach().to(torch.int8) - if self.use_weight_cache: - self.w_q_cache = w_q + # Restore shape + w_out = w_recon.view(out_channels, -1) + if pad_len > 0: + w_out = w_out[:, :-pad_len] + return w_out + + def forward(self, w): + if self.is_frozen: + # If frozen, directly return cached reconstructed weights (or real-time dequantization from Buffer) + if self.w_recon_cached is None: + # Logic to reconstruct from weight_q + scale_1 + scale_2 can be written here + pass + return ( + self.w_recon_cached + if self.w_recon_cached is not None + else self.quantize_dequantize(w) + ) + return self.quantize_dequantize(w) - return F.linear(x_q, w_q, self.bias) + +class QLinearLPBQ(QLinear): + def 
__init__(self, in_features, out_features, bias=True, block_size=64): + super().__init__(in_features, out_features, bias) + self.weight_quant = DoubleQuantizer(block_size) + + def forward(self, x): + # Must use quantized weights w_q for computation + w_q = self.weight_quant(self.weight) + return F.linear(x, w_q, self.bias) diff --git a/pymllm/backends/qualcomm/transformers/core/rms_norm.py b/pymllm/backends/qualcomm/transformers/core/rms_norm.py index eb9ec8d88..5606dafaa 100644 --- a/pymllm/backends/qualcomm/transformers/core/rms_norm.py +++ b/pymllm/backends/qualcomm/transformers/core/rms_norm.py @@ -1,72 +1,67 @@ import torch import torch.nn as nn -import torch.nn.functional as F from torch.ao.quantization import FakeQuantize, MinMaxObserver class QRMSNorm(nn.Module): - """ - RMSNorm with int16 per-tensor symmetric quantized weight. - - This implementation applies quantization to the weight tensor only, - using per-tensor symmetric quantization with int16 range. - """ - def __init__( self, normalized_shape, eps=1e-6, - elementwise_affine=True, - already_quantized_weight=False, + quant_bits=16, ): super().__init__() - + self.eps = eps if isinstance(normalized_shape, int): normalized_shape = (normalized_shape,) - self.normalized_shape = tuple(normalized_shape) - self.eps = eps - self.already_quantized_w = already_quantized_weight - if elementwise_affine: - self.weight = nn.Parameter( - torch.ones(normalized_shape, dtype=torch.bfloat16) - ) - else: - self.register_parameter("weight", None) + self.weight = nn.Parameter(torch.ones(normalized_shape)) - # Weight quantization for int16 per-tensor symmetric - self.weight_quant = FakeQuantize( - observer=MinMaxObserver, - quant_min=-32768, - quant_max=32767, + # Quantization configuration for Weight + self.weight_fake_quant = FakeQuantize( + observer=MinMaxObserver.with_args( + qscheme=torch.per_tensor_symmetric, dtype=torch.qint32 + ), + quant_min=-(2 ** (quant_bits - 1)), + quant_max=2 ** (quant_bits - 1) - 1, 
dtype=torch.qint32, qscheme=torch.per_tensor_symmetric, ) - self.w_q_cache = None - self.use_weight_cache = already_quantized_weight - - def _clear_cache(self): - self.w_q_cache = None - def forward(self, x): - # Compute RMS norm - variance = x.to(torch.float32).pow(2).mean(-1, keepdim=True) - x = x * torch.rsqrt(variance + self.eps) + # 1. RMSNorm basic logic (using float32 to ensure stability) + input_dtype = x.dtype + x_fp32 = x.float() + variance = x_fp32.pow(2).mean(-1, keepdim=True) + x_normed = x_fp32 * torch.rsqrt(variance + self.eps) - # Apply quantized weight - if self.weight is not None: - if self.w_q_cache is not None: - w_q = self.w_q_cache - else: - if self.already_quantized_w: - w_q = self.weight - else: - w_q = self.weight_quant(self.weight) + # 2. Weight fake quantization + # If observer is not closed, this step will continuously update scale/zp + # If freeze_weight() is called, this will just use fixed scale/zp for quantization + w_q = self.weight_fake_quant(self.weight) - if self.use_weight_cache: - self.w_q_cache = w_q + return (x_normed * w_q).to(input_dtype) + + @torch.no_grad() + def freeze_weight(self): + """ + Manually trigger Observer to observe and calculate scale, then lock it. + Solve the problem of output being 0 on first run. 
+ """ + self.weight_fake_quant.activation_post_process(self.weight) + s, zp = self.weight_fake_quant.activation_post_process.calculate_qparams() + self.weight_fake_quant.scale.copy_(s) + self.weight_fake_quant.zero_point.copy_(zp) + self.weight_fake_quant.disable_observer() + class_name = self.__class__.__name__ + instance_class_name = type(self).__name__ + print( + f"Class: {class_name}, Instance: {instance_class_name}, Weight Quantized: scale={self.weight_fake_quant.scale}, zp={self.weight_fake_quant.zero_point}" + ) - x = x * w_q + def disable_quant(self): + """Completely turn off quantization noise and return to floating point mode""" + self.weight_fake_quant.disable_fakequant() - return x + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" diff --git a/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py b/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py new file mode 100644 index 000000000..5918b5d85 --- /dev/null +++ b/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py @@ -0,0 +1,687 @@ +# coding=utf-8 +# Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Callable, Optional, Union + +import torch +from torch import nn + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.generation import GenerationMixin +from transformers.masking_utils import ( + create_causal_mask, + create_sliding_window_causal_mask, +) +from transformers.modeling_flash_attention_utils import FlashAttentionKwargs +from transformers.modeling_layers import ( + GenericForQuestionAnswering, + GenericForSequenceClassification, + GenericForTokenClassification, + GradientCheckpointingLayer, +) +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, +) +from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from transformers.processing_utils import Unpack +from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple +from transformers.utils.deprecation import deprecate_kwarg +from transformers.utils.generic import check_model_inputs +from transformers.models.qwen3.configuration_qwen3 import Qwen3Config + +# Replace linear, rms_norm with: +from pymllm.backends.qualcomm.transformers.core.rms_norm import QRMSNorm +from pymllm.backends.qualcomm.transformers.core.qlinear import ( + QLinearLPBQ, + QLinearW8A16_PerChannelSym, +) +from pymllm.backends.qualcomm.transformers.core.qdq import ActivationQDQ + + +class Qwen3MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = QLinearLPBQ( + self.hidden_size, self.intermediate_size, bias=False, block_size=32 + ) + self.up_proj = QLinearLPBQ( + self.hidden_size, self.intermediate_size, bias=False, block_size=32 + ) + self.down_proj = QLinearLPBQ( + self.intermediate_size, self.hidden_size, bias=False, 
block_size=32 + ) + self.act_fn = ACT2FN[config.hidden_act] + + # QDQ + self.up_proj_input_qdq = ActivationQDQ(bits=16) + self.up_proj_output_qdq = ActivationQDQ(bits=16) + self.gate_proj_output_qdq = ActivationQDQ(bits=16) + self.act_output_qdq = ActivationQDQ(bits=16) + self.down_proj_input_qdq = ActivationQDQ(bits=16) + + def forward(self, x): + x = self.up_proj_input_qdq(x) + up_result = self.up_proj_output_qdq(self.up_proj(x)) + gate_result = self.gate_proj_output_qdq(self.gate_proj(x)) + gate_result = self.act_output_qdq(self.act_fn(gate_result)) + o = self.down_proj_input_qdq(gate_result * up_result) + o = self.down_proj(o) + return o + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. 
+ Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand( + batch, num_key_value_heads, n_rep, slen, head_dim + ) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Qwen3Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: Qwen3Config, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr( + config, "head_dim", config.hidden_size // config.num_attention_heads + ) + self.num_key_value_groups = ( + config.num_attention_heads // config.num_key_value_heads + ) + self.scaling = self.head_dim**-0.5 + self.attention_dropout = config.attention_dropout + self.is_causal = True + + self.q_proj = QLinearLPBQ( + config.hidden_size, + config.num_attention_heads * self.head_dim, + bias=config.attention_bias, + block_size=32, + ) + self.k_proj = QLinearLPBQ( + config.hidden_size, + config.num_key_value_heads * self.head_dim, + bias=config.attention_bias, + block_size=32, + ) + self.v_proj = QLinearLPBQ( + config.hidden_size, + config.num_key_value_heads * self.head_dim, + bias=config.attention_bias, + block_size=32, + ) + self.o_proj = QLinearLPBQ( + config.num_attention_heads * self.head_dim, + 
config.hidden_size, + bias=config.attention_bias, + block_size=32, + ) + self.q_norm = QRMSNorm( + self.head_dim, eps=config.rms_norm_eps, quant_bits=16 + ) # unlike olmo, only on the head dim! + self.k_norm = QRMSNorm( + self.head_dim, eps=config.rms_norm_eps, quant_bits=16 + ) # thus post q_norm does not need reshape + self.sliding_window = ( + config.sliding_window + if config.layer_types[layer_idx] == "sliding_attention" + else None + ) + + # QDQ + self.q_proj_input_qdq = ActivationQDQ(bits=16) + self.q_norm_input_qdq = ActivationQDQ(bits=16) + self.q_norm_output_qdq = ActivationQDQ(bits=16) + self.k_norm_input_qdq = ActivationQDQ(bits=16) + self.k_norm_output_qdq = ActivationQDQ(bits=16) + self.q_rope_mul_0_output_qdq = ActivationQDQ(bits=16) + self.q_rope_mul_1_output_qdq = ActivationQDQ(bits=16) + self.q_rope_add_0_output_qdq = ActivationQDQ(bits=16) + self.k_rope_mul_0_output_qdq = ActivationQDQ(bits=16) + self.k_rope_mul_1_output_qdq = ActivationQDQ(bits=16) + self.k_rope_add_0_output_qdq = ActivationQDQ(bits=16) + self.k_cast_to_int8_qdq = ActivationQDQ(bits=8) + self.v_cast_to_int8_qdq = ActivationQDQ(bits=8) + self.qk_matmul_output_qdq = ActivationQDQ(bits=16) + self.scaling_qdq = ActivationQDQ(bits=16) + self.reduce_min_output_qdq = ActivationQDQ(bits=16) + self.minus_0_output_qdq = ActivationQDQ(bits=16) + self.softmax_output_qdq = ActivationQDQ(bits=16) + self.attn_value_matmul_output_qdq = ActivationQDQ(bits=16) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + hidden_states = 
self.q_proj_input_qdq(hidden_states) + + query_states = self.q_norm( + self.q_norm_input_qdq(self.q_proj(hidden_states)).view(hidden_shape) + ).transpose(1, 2) + key_states = self.k_norm( + self.k_norm_input_qdq(self.k_proj(hidden_states)).view(hidden_shape) + ).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + query_states = self.q_norm_output_qdq(query_states) + key_states = self.k_norm_output_qdq(key_states) + + cos, sin = position_embeddings + cos = cos.unsqueeze(1) + sin = sin.unsqueeze(1) + query_states = self.q_rope_add_0_output_qdq( + self.q_rope_mul_0_output_qdq(query_states * cos) + + self.q_rope_mul_1_output_qdq(rotate_half(query_states) * sin) + ) + key_states = self.k_rope_add_0_output_qdq( + self.k_rope_mul_0_output_qdq(key_states * cos) + + self.k_rope_mul_1_output_qdq(rotate_half(key_states) * sin) + ) + + key_states = self.k_cast_to_int8_qdq(key_states) + value_states = self.v_cast_to_int8_qdq(value_states) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update( + key_states, value_states, self.layer_idx, cache_kwargs + ) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = self.qk_matmul_output_qdq( + torch.matmul(query_states, key_states.transpose(2, 3)) + ) * self.scaling_qdq( + torch.ones(1, dtype=torch.bfloat16, device=value_states.device) + * self.scaling + ) + + attn_min = self.reduce_min_output_qdq( + torch.amin(attn_weights, dim=-1, keepdim=True) + ) + attn_vv = self.minus_0_output_qdq(attn_min - 20) + attn_weights = torch.where(attention_mask == 0, attn_weights, attn_vv) + + attn_weights = self.softmax_output_qdq( + nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to( + 
query_states.dtype + ) + ) + attn_output = self.attn_value_matmul_output_qdq( + torch.matmul(attn_weights, value_states) + ) + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class Qwen3DecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: Qwen3Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = Qwen3Attention(config=config, layer_idx=layer_idx) + + self.mlp = Qwen3MLP(config) + self.input_layernorm = QRMSNorm( + config.hidden_size, eps=config.rms_norm_eps, quant_bits=16 + ) + self.post_attention_layernorm = QRMSNorm( + config.hidden_size, eps=config.rms_norm_eps, quant_bits=16 + ) + self.attention_type = config.layer_types[layer_idx] + + # QDQ + self.input_layernorm_input_qdq = ActivationQDQ(bits=16) + self.add_0_lhs_input_qdq = ActivationQDQ(bits=16) + self.add_0_output_qdq = ActivationQDQ(bits=16) + self.add_1_lhs_input_qdq = ActivationQDQ(bits=16) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[ + tuple[torch.Tensor, torch.Tensor] + ] = None, # necessary, but kept here for BC + **kwargs: Unpack[TransformersKwargs], + ) -> torch.Tensor: + hidden_states = self.input_layernorm_input_qdq(hidden_states) + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + # Self Attention + hidden_states, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + 
cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = self.add_0_output_qdq( + residual + self.add_0_lhs_input_qdq(hidden_states) + ) + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + self.add_1_lhs_input_qdq(hidden_states) + return hidden_states + + +@auto_docstring +class Qwen3PreTrainedModel(PreTrainedModel): + config: Qwen3Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Qwen3DecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + + _can_compile_fullgraph = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": Qwen3DecoderLayer, + "attentions": Qwen3Attention, + } + + +class Qwen3RotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: Qwen3Config, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): + self.rope_type = config.rope_scaling.get( + "rope_type", config.rope_scaling.get("type") + ) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = ( + self.inv_freq[None, :, None] + .float() + .expand(position_ids.shape[0], -1, 1) + .to(x.device) + ) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = ( + x.device.type + if isinstance(x.device.type, str) and x.device.type != "mps" + else "cpu" + ) + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = ( + inv_freq_expanded.float() @ position_ids_expanded.float() + ).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +@auto_docstring +class Qwen3Model(Qwen3PreTrainedModel): + def __init__(self, config: Qwen3Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding( + config.vocab_size, config.hidden_size, self.padding_idx + ) + self.layers = nn.ModuleList( + [ + Qwen3DecoderLayer(config, layer_idx) + for layer_idx in range(config.num_hidden_layers) + ] + ) + self.norm = QRMSNorm(config.hidden_size, eps=config.rms_norm_eps, quant_bits=16) + self.rotary_emb = Qwen3RotaryEmbedding(config=config) + self.gradient_checkpointing = False + self.has_sliding_layers = "sliding_attention" in self.config.layer_types + + # Register sin and cos as buffers + self.register_buffer("mllm_max_sin_embedding", None) + self.register_buffer("mllm_max_cos_embedding", None) + self.sin_embedding_input_qdq = ActivationQDQ(bits=16) + self.cos_embedding_input_qdq = ActivationQDQ(bits=16) + + # Initialize weights and apply final processing + self.post_init() + + @check_model_inputs() + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + 
inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPast: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You must specify exactly one of input_ids or inputs_embeds" + ) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache(config=self.config) + + if cache_position is None: + past_seen_tokens = ( + past_key_values.get_seq_length() if past_key_values is not None else 0 + ) + cache_position = torch.arange( + past_seen_tokens, + past_seen_tokens + inputs_embeds.shape[1], + device=inputs_embeds.device, + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + # It may already have been prepared by e.g. `generate` + if not isinstance(causal_mask_mapping := attention_mask, dict): + # Prepare mask arguments + mask_kwargs = { + "config": self.config, + "input_embeds": inputs_embeds, + "attention_mask": attention_mask, + "cache_position": cache_position, + "past_key_values": past_key_values, + "position_ids": position_ids, + } + # Create the masks + causal_mask_mapping = { + "full_attention": create_causal_mask(**mask_kwargs), + } + # The sliding window alternating layers are not always activated depending on the config + if self.has_sliding_layers: + causal_mask_mapping["sliding_attention"] = ( + create_sliding_window_causal_mask(**mask_kwargs) + ) + + hidden_states = inputs_embeds + + if self.mllm_max_sin_embedding is None and self.mllm_max_cos_embedding is None: + mllm_qualcomm_max_length = kwargs.get("mllm_qualcomm_max_length", None) + assert mllm_qualcomm_max_length is not None + max_position_ids = torch.arange( + 0, + mllm_qualcomm_max_length, + dtype=position_ids.dtype, + device=position_ids.device, + ).unsqueeze(0) + self.mllm_max_cos_embedding, 
self.mllm_max_sin_embedding = self.rotary_emb( + hidden_states, max_position_ids + ) + self.mllm_max_cos_embedding = self.cos_embedding_input_qdq( + self.mllm_max_cos_embedding + ) + self.mllm_max_sin_embedding = self.sin_embedding_input_qdq( + self.mllm_max_sin_embedding + ) + + # create position embeddings to be shared across the decoder layers + position_embeddings = ( + self.mllm_max_cos_embedding[:, position_ids.squeeze(0), :], + self.mllm_max_sin_embedding[:, position_ids.squeeze(0), :], + ) + + # Generate causal mask based on position_ids length + # For prefill, we need a lower triangular mask + _, seq_len = input_ids.shape + if seq_len != 1: + causal_mask = 1 - torch.tril( + torch.ones(seq_len, seq_len, dtype=torch.int8, device=input_ids.device) + ) + # [1, 1, seq_len, seq_len] + causal_mask = causal_mask.unsqueeze(0).unsqueeze(0) + else: + # [1, 1, seq_len, seq_len] + causal_mask = torch.zeros( + (1, 1, 1, seq_len), dtype=torch.int8, device=input_ids.device + ) + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + hidden_states = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + hidden_states = self.norm(hidden_states) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values if use_cache else None, + ) + + +@auto_docstring +class Qwen3ForCausalLM(Qwen3PreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} + _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} + + def __init__(self, config): + super().__init__(config) + self.model = Qwen3Model(config) + self.vocab_size = config.vocab_size + self.lm_head = QLinearW8A16_PerChannelSym( + config.hidden_size, config.vocab_size, bias=False + ) + self.mllm_qualcomm_max_length = None + + # 
Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> CausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> from transformers import AutoTokenizer, Qwen3ForCausalLM + + >>> model = Qwen3ForCausalLM.from_pretrained("Qwen/Qwen3-8B") + >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
+ ```""" + kwargs.update({"mllm_qualcomm_max_length": self.mllm_qualcomm_max_length}) + outputs: BaseModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = ( + slice(-logits_to_keep, None) + if isinstance(logits_to_keep, int) + else logits_to_keep + ) + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class Qwen3ForSequenceClassification( + GenericForSequenceClassification, Qwen3PreTrainedModel +): + pass + + +class Qwen3ForTokenClassification(GenericForTokenClassification, Qwen3PreTrainedModel): + pass + + +class Qwen3ForQuestionAnswering(GenericForQuestionAnswering, Qwen3PreTrainedModel): + base_model_prefix = ( + "transformer" # For BC, where `transformer` was used instead of `model` + ) + + +__all__ = [ + "Qwen3ForCausalLM", + "Qwen3ForQuestionAnswering", + "Qwen3PreTrainedModel", + "Qwen3Model", + "Qwen3ForSequenceClassification", + "Qwen3ForTokenClassification", +] diff --git a/pymllm/backends/qualcomm/transformers/qwen3/runner.py b/pymllm/backends/qualcomm/transformers/qwen3/runner.py new file mode 100644 index 000000000..c2aed54c8 --- /dev/null +++ b/pymllm/backends/qualcomm/transformers/qwen3/runner.py @@ -0,0 +1,155 @@ +import torch +from tqdm import tqdm +from modelscope.msdatasets import MsDataset +from transformers import AutoTokenizer +from 
pymllm.backends.qualcomm.transformers.core.qdq import ActivationQDQ +from pymllm.backends.qualcomm.transformers.core.rms_norm import QRMSNorm +from pymllm.backends.qualcomm.transformers.core.qlinear import ( + QLinearLPBQ, + QLinearW8A16_PerChannelSym, +) +from pymllm.backends.qualcomm.transformers.qwen3.modeling_qwen3 import Qwen3ForCausalLM + + +def freeze_qwen3_rmsnorm_weight(m): + if isinstance(m, QRMSNorm): + m.freeze_weight() + + +def freeze_qwen3_linear_weight(m): + if isinstance(m, QLinearLPBQ) or isinstance(m, QLinearW8A16_PerChannelSym): + m.freeze_weight() + + +def disable_qdq_observer(m): + if isinstance(m, ActivationQDQ): + m.disable_observer() + + +def enable_qdq_observer(m): + if isinstance(m, ActivationQDQ): + m.enable_observer() + + +class Qwen3Quantizer: + def __init__(self, model_path: str, mllm_qualcomm_max_length=2048): + self.tokenizer = AutoTokenizer.from_pretrained(model_path) + self.model = Qwen3ForCausalLM.from_pretrained( + model_path, + attn_implementation="eager", + ) + self.mllm_qualcomm_max_length = mllm_qualcomm_max_length + self.model.mllm_qualcomm_max_length = mllm_qualcomm_max_length + + # PTQ All Weights. + self.model.apply(freeze_qwen3_rmsnorm_weight) + self.model.apply(freeze_qwen3_linear_weight) + print("All PTQ weights preparation done.") + + def freeze_activation(self): + self.model.apply(disable_qdq_observer) + + def enable_activation_update(self): + self.model.apply(enable_qdq_observer) + + def infer(self, prompt: str): + messages = [{"role": "user", "content": prompt}] + text = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + enable_thinking=False, # Switches between thinking and non-thinking modes. Default is True. 
+ ) + model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device) + + # conduct text completion + generated_ids = self.model.generate( + **model_inputs, + max_new_tokens=self.mllm_qualcomm_max_length + - len(model_inputs.input_ids[0]) + - 1, + do_sample=False, + temperature=None, + top_p=None, + top_k=None, + ) + output_ids = generated_ids[0][len(model_inputs.input_ids[0]) :].tolist() + + # parsing thinking content + try: + # rindex finding 151668 () + index = len(output_ids) - output_ids[::-1].index(151668) + except ValueError: + index = 0 + + thinking_content = self.tokenizer.decode( + output_ids[:index], skip_special_tokens=True + ).strip("\n") + content = self.tokenizer.decode( + output_ids[index:], skip_special_tokens=True + ).strip("\n") + + print("thinking content:", thinking_content) + print("content:", content) + + def calibrate(self, num_samples=64, max_seq_length=512): + """ + Perform calibration using Wikipedia dataset (PTQ) + :param num_samples: Number of samples for calibration + :param max_seq_length: Maximum length for each sample (not exceeding mllm_qualcomm_max_length) + """ + print( + f"Starting calibration, samples: {num_samples}, max length: {max_seq_length}" + ) + + # 1. Enable QDQ Observer for activation values + self.enable_activation_update() + self.model.eval() + + # 2. Load Wikipedia dataset (English version example) + # Use streaming=True to download and process on the fly, without downloading the full几十G dataset + dataset = MsDataset.load( + "modelscope/wikitext", + subset_name="wikitext-103-v1", + split="train", + trust_remote_code=True, + ) + + # 3. 
Execute forward pass (Prefill stage) + samples_processed = 0 + + # Ensure no gradient calculation during inference + with torch.no_grad(): + pbar = tqdm(total=num_samples, desc="Calibrating") + for entry in dataset: + if samples_processed >= num_samples: + break + + messages = [{"role": "user", "content": entry["text"]}] + text = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + enable_thinking=False, # Switches between thinking and non-thinking modes. Default is True. + ) + model_inputs = self.tokenizer([text], return_tensors="pt").to( + self.model.device + ) + + # Only need Prefill stage: directly call forward + # This will trigger observer update statistics in ActivationQDQ + self.model.generate( + **model_inputs, + max_new_tokens=1, + do_sample=False, + temperature=None, + top_p=None, + top_k=None, + ) + + samples_processed += 1 + pbar.update(1) + + # 4. Close Observer, freeze calibrated quantization parameters + self.freeze_activation() + print("\nCalibration completed, activation quantization parameters frozen.") diff --git a/pymllm/backends/qualcomm/transformers/static_qwen3.py b/pymllm/backends/qualcomm/transformers/static_qwen3.py index becbe9f48..186e312ca 100644 --- a/pymllm/backends/qualcomm/transformers/static_qwen3.py +++ b/pymllm/backends/qualcomm/transformers/static_qwen3.py @@ -7,37 +7,7 @@ QLinearLPBQ, QLinearW8A16_PerChannelSym_PerTensorSym, ) - - -# This settings below is for Qwen1.7B -class Qwen3Config: - def __init__(self): - self.attention_bias = False - self.attention_dropout = 0.0 - self.bos_token_id = 151643 - self.eos_token_id = 151645 - self.head_dim = 128 - self.hidden_act = "silu" - self.hidden_size = 2048 - self.initializer_range = 0.02 - self.intermediate_size = 6144 - self.max_position_embeddings = 40960 - self.max_window_layers = 28 - self.model_type = "qwen3" - self.num_attention_heads = 16 - self.num_hidden_layers = 28 - self.num_key_value_heads = 8 - self.pad_token_id = 151643 - 
self.rms_norm_eps = 1e-06 - self.rope_scaling = None - self.rope_theta = 1000000 - self.sliding_window = None - self.tie_word_embeddings = True - self.torch_dtype = "bfloat16" - self.transformers_version = "4.51.0" - self.use_cache = True - self.use_sliding_window = False - self.vocab_size = 151936 +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer def generate_rope_cache( @@ -153,6 +123,13 @@ def __init__(self, config): self.qdq_act = QDQ_OP["A16-PerTensor"]() self.qdq_middle = QDQ_OP["A16-PerTensor"]() + def freeze_observer(self): + for name, value in self.__dict__.items(): + if isinstance(value, QDQ_OP["A16-PerTensor"]) or isinstance( + value, QDQ_OP["A8-PerTensor"] + ): + value.disable_observer() + def forward(self, x): """ input: @@ -163,7 +140,7 @@ def forward(self, x): x = self.qdq_x(x) up_result = self.qdq_up_result(self.up_proj(x)) gate_result = self.qdq_gate_result(self.gate_proj(x)) - up_result = self.qdq_act(self.act_fn(up_result)) + gate_result = self.qdq_act(self.act_fn(gate_result)) o = self.qdq_middle(gate_result * up_result) o = self.down_proj(o) return o @@ -236,6 +213,13 @@ def __init__(self, config, layer_idx: int): self.k_cache = None self.v_cache = None + def freeze_observer(self): + for name, value in self.__dict__.items(): + if isinstance(value, QDQ_OP["A16-PerTensor"]) or isinstance( + value, QDQ_OP["A8-PerTensor"] + ): + value.disable_observer() + def forward( self, hidden_states: torch.Tensor, @@ -258,6 +242,7 @@ def forward( query_states = ( self.q_proj(quantized_hidden_states).view(hidden_shape).transpose(1, 2) ) + key_states = ( self.k_proj(quantized_hidden_states).view(hidden_shape).transpose(1, 2) ) @@ -289,6 +274,9 @@ def forward( + self.qdq_rope_5(rot_k * sin_embedding) ) + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + key_states = self.qdq_4(key_states) # [B, H, D, S] key_states = key_states.transpose(2, 3) @@ -307,12 
+295,11 @@ def forward( self.k_cache = key_states self.v_cache = value_states - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - attn = query_states @ key_states attn = self.qdq_5(attn) - attn = attn / self.qdq_6(torch.ones(1, dtype=torch.bfloat16) * self.scaling) + attn = attn / self.qdq_6( + torch.ones(1, dtype=torch.bfloat16, device=attn.device) * self.scaling + ) attn = self.qdq_7(attn) attn_min = torch.amin(attn, dim=-1, keepdim=True) attn_min = self.qdq_8(attn_min) @@ -320,12 +307,17 @@ def forward( attn_vv = self.qdq_9(attn_vv) attn = torch.where(causal_mask == 0, attn, attn_vv) attn = self.qdq_10(attn) - attn = F.softmax(attn, -1) + attn = F.softmax(attn.to(torch.float32), -1).to(torch.bfloat16) + print(attn) + exit(0) attn = self.qdq_11(attn) y = attn @ value_states y = self.qdq_12(y) y = y.transpose(1, 2).reshape(bsz, seq_len, -1) y = self.o_proj(y) + print(y.shape) + print(y) + exit(0) return y @@ -345,6 +337,17 @@ def __init__(self, config, layer_idx: int): self.qdq_2 = QDQ_OP["A16-PerTensor"]() self.qdq_3 = QDQ_OP["A16-PerTensor"]() + def freeze_observer(self): + self.mlp.freeze_observer() + self.self_attn.freeze_observer() + self.input_layernorm.freeze_observer() + self.post_attention_layernorm.freeze_observer() + for name, value in self.__dict__.items(): + if isinstance(value, QDQ_OP["A16-PerTensor"]) or isinstance( + value, QDQ_OP["A8-PerTensor"] + ): + value.disable_observer() + def forward( self, hidden_states: torch.Tensor, @@ -396,8 +399,18 @@ def __init__(self, config): self.norm = QRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.qdq_0 = QDQ_OP["A16-PerTensor"]() + def freeze_observer(self): + self.norm.freeze_observer() + for item in self.layers: + item.freeze_observer() + for name, value in self.__dict__.items(): + if isinstance(value, QDQ_OP["A16-PerTensor"]) or isinstance( + value, QDQ_OP["A8-PerTensor"] + ): + value.disable_observer() + def 
forward(self, input_ids, sin, cos, causal_mask): - inputs_embeds = self.embed_tokens(input_ids) + inputs_embeds = self.embed_tokens(input_ids).to(torch.bfloat16) hidden_states = inputs_embeds for decoder_layer in self.layers[: self.config.num_hidden_layers]: @@ -407,8 +420,9 @@ def forward(self, input_ids, sin, cos, causal_mask): return hidden_states -class Qwen3ForCausalLM: +class Qwen3ForCausalLM(nn.Module): def __init__(self, config): + super().__init__() self.config = config self.model = Qwen3Model(config) self.vocab_size = config.vocab_size @@ -423,6 +437,22 @@ def __init__(self, config): self.register_buffer("sin", None) self.register_buffer("cos", None) + def freeze_observer(self): + self.model.freeze_observer() + for name, value in self.__dict__.items(): + if isinstance(value, QDQ_OP["A16-PerTensor"]) or isinstance( + value, QDQ_OP["A8-PerTensor"] + ): + value.disable_observer() + + def disable_fakequant(self): + # self.model.disable_fakequant() + for name, value in self.__dict__.items(): + if isinstance(value, QDQ_OP["A16-PerTensor"]) or isinstance( + value, QDQ_OP["A8-PerTensor"] + ): + value.disable_fakequant() + def forward( self, input_ids, @@ -466,18 +496,134 @@ def forward( logits = self.lm_head(self.qdq_0(out)) return logits - def _update_kv_cache_by_copy(self): - pass - def _freeze_observer(self): - pass +class Qwen3Quantizer: + def __init__(self): + # Other stuff + self.tokenizer: AutoTokenizer = None + self.model: Qwen3ForCausalLM = None + self.config: AutoConfig = None + + def load_from_hf(self, model_path: str, verbose: bool = False): + self.config = AutoConfig.from_pretrained(model_path) + state_dict = AutoModelForCausalLM.from_pretrained(model_path).state_dict() + self.model = Qwen3ForCausalLM(self.config) + + # Check if all original weight is in state_dict + model_keys = set(self.model.state_dict().keys()) + loaded_keys = set(state_dict.keys()) + + # 1. 
Keys present in model but missing in state_dict + missing_keys = model_keys - loaded_keys + if missing_keys and verbose: + print( + f"\n⚠️ Keys present in model but missing in state_dict ({len(missing_keys)} keys):" + ) + for k in sorted(missing_keys): + print(f" - {k}") + + # 2. Keys present in state_dict but unexpected in model + unexpected_keys = loaded_keys - model_keys + if unexpected_keys: + print( + f"\n⚠️ Keys present in state_dict but unexpected in model ({len(unexpected_keys)} keys):" + ) + for k in sorted(unexpected_keys): + print(f" - {k}") - def infer(self, model_path: str, prompt: str, max_length) -> str: - pass + self.model.load_state_dict(state_dict, strict=False) + self.model.cuda() + self.tokenizer = AutoTokenizer.from_pretrained(model_path) - def calibrate(self, model_path: str, dataset_path: str): + def infer(self, prompt: str, enable_fake_quant: bool = True) -> str: """ - calibrate Only on PREFILL stage !!! + Generate response for the given prompt. + + Args: + prompt: Input text prompt + + Returns: + Generated text response + """ + # Tokenize the input prompt + self.model.freeze_observer() + if not enable_fake_quant: + self.model.disable_fakequant() + if hasattr(self.tokenizer, "chat_template") and self.tokenizer.chat_template: + formatted_prompt = self.tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + tokenize=False, + add_generation_prompt=True, + enable_thinking=False, + ) + else: + formatted_prompt = prompt + inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to("cuda") + input_ids = inputs["input_ids"] + seq_len = input_ids.shape[1] + + # Initialize position_ids + position_ids = torch.arange(seq_len, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0) # Add batch dimension + + # Get max_length from config or use a default value + max_length = getattr(self.config, "max_position_embeddings", 2048) + + # TODO remove this + max_length = 8 + + # Prefill stage: process the 
prompt and build KV cache + with torch.no_grad(): + logits = self.model( + input_ids=input_ids, + position_ids=position_ids, + max_length=2048, + ) + + # Get the last token from prefill as the first generated token + next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True) + generated_tokens = next_token.clone() + + # Decode stage: generate tokens one by one using KV cache + while generated_tokens.shape[1] < max_length: + # Update position_ids for the new token + new_position_id = position_ids[:, -1] + 1 + position_ids = new_position_id.unsqueeze(0) + + with torch.no_grad(): + logits = self.model( + input_ids=next_token, + position_ids=position_ids, + max_length=max_length, + ) + + # Get next token (greedy decoding) + next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True) + + # Append generated token + generated_tokens = torch.cat([generated_tokens, next_token], dim=1) + + # Stop if EOS token is generated + if next_token.item() == self.tokenizer.eos_token_id: + break + + # Decode generated tokens to text + generated_text = self.tokenizer.decode( + generated_tokens[0], skip_special_tokens=True + ) + + return generated_text + + def calibrate(self, dataset_path: str): + """ + Calibrate Only on PREFILL stage !!! """ # Call infer after calibrate done. 
pass + + +if __name__ == "__main__": + quantizer = Qwen3Quantizer() + quantizer.load_from_hf("/mnt/user-ssd/shared_models/Qwen3-1.7B/") + result = quantizer.infer("hello") + print(result) diff --git a/pymllm/backends/qualcomm/transformers/train.py b/pymllm/backends/qualcomm/transformers/train.py index e69de29bb..a36416a44 100644 --- a/pymllm/backends/qualcomm/transformers/train.py +++ b/pymllm/backends/qualcomm/transformers/train.py @@ -0,0 +1,6 @@ +from pymllm.backends.qualcomm.transformers.qwen3.runner import Qwen3Quantizer + +if __name__ == "__main__": + m = Qwen3Quantizer() + m.calibrate() + m.infer("简述中国断代史") diff --git a/requirements-qnn-aot.txt b/requirements-qnn-aot.txt new file mode 100644 index 000000000..f3f435c9f --- /dev/null +++ b/requirements-qnn-aot.txt @@ -0,0 +1 @@ +addict==2.4.0 From 9ea8f9de0d2fd45c5bdaa611c0d286b769c11361 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Sat, 3 Jan 2026 05:57:28 +0000 Subject: [PATCH 05/13] feat: update AOT Qualcomm Qwen3 --- .../backends/qualcomm/transformers/README.md | 7 +- .../transformers/qwen3/modeling_qwen3.py | 28 +- .../qualcomm/transformers/qwen3/runner.py | 17 +- .../qualcomm/transformers/qwen3/train.py | 37 ++ .../qualcomm/transformers/static_qwen3.py | 629 ------------------ .../backends/qualcomm/transformers/train.py | 6 - requirements-qnn-aot.txt | 3 + 7 files changed, 77 insertions(+), 650 deletions(-) create mode 100644 pymllm/backends/qualcomm/transformers/qwen3/train.py delete mode 100644 pymllm/backends/qualcomm/transformers/static_qwen3.py delete mode 100644 pymllm/backends/qualcomm/transformers/train.py diff --git a/pymllm/backends/qualcomm/transformers/README.md b/pymllm/backends/qualcomm/transformers/README.md index 256c60ece..9d677a86f 100644 --- a/pymllm/backends/qualcomm/transformers/README.md +++ b/pymllm/backends/qualcomm/transformers/README.md @@ -1,3 +1,8 @@ - # Transformers Quantization for Qualcomm Backend +## Qwen3 + +```shell +cd ./qwen3 +python train.py 
--model_path "/your/model/path/" --max_length 2048 --num_samples 128 --infer_text "为什么伟大不能被计划" +``` diff --git a/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py b/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py index 5918b5d85..1fe04f14d 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py +++ b/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py @@ -202,9 +202,11 @@ def __init__(self, config: Qwen3Config, layer_idx: int): self.k_rope_add_0_output_qdq = ActivationQDQ(bits=16) self.k_cast_to_int8_qdq = ActivationQDQ(bits=8) self.v_cast_to_int8_qdq = ActivationQDQ(bits=8) + self.v_cast_to_int16_qdq = ActivationQDQ(bits=16) self.qk_matmul_output_qdq = ActivationQDQ(bits=16) self.scaling_qdq = ActivationQDQ(bits=16) self.reduce_min_output_qdq = ActivationQDQ(bits=16) + self.mul_0_output_qdq = ActivationQDQ(bits=16) self.minus_0_output_qdq = ActivationQDQ(bits=16) self.softmax_output_qdq = ActivationQDQ(bits=16) self.attn_value_matmul_output_qdq = ActivationQDQ(bits=16) @@ -248,7 +250,7 @@ def forward( ) key_states = self.k_cast_to_int8_qdq(key_states) - value_states = self.v_cast_to_int8_qdq(value_states) + value_states = self.v_cast_to_int8_qdq(self.v_cast_to_int16_qdq(value_states)) if past_key_values is not None: # sin and cos are specific to RoPE models; cache_position needed for the static cache @@ -260,11 +262,14 @@ def forward( key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - attn_weights = self.qk_matmul_output_qdq( - torch.matmul(query_states, key_states.transpose(2, 3)) - ) * self.scaling_qdq( - torch.ones(1, dtype=torch.bfloat16, device=value_states.device) - * self.scaling + attn_weights = self.mul_0_output_qdq( + self.qk_matmul_output_qdq( + torch.matmul(query_states, key_states.transpose(2, 3)) + ) + * self.scaling_qdq( + torch.ones(1, dtype=torch.bfloat16, device=value_states.device) + * self.scaling + ) ) attn_min 
= self.reduce_min_output_qdq( @@ -444,6 +449,7 @@ def __init__(self, config: Qwen3Config): self.register_buffer("mllm_max_cos_embedding", None) self.sin_embedding_input_qdq = ActivationQDQ(bits=16) self.cos_embedding_input_qdq = ActivationQDQ(bits=16) + self.norm_input_qdq = ActivationQDQ(bits=16) # Initialize weights and apply final processing self.post_init() @@ -560,7 +566,7 @@ def forward( **kwargs, ) - hidden_states = self.norm(hidden_states) + hidden_states = self.norm(self.norm_input_qdq(hidden_states)) return BaseModelOutputWithPast( last_hidden_state=hidden_states, past_key_values=past_key_values if use_cache else None, @@ -582,6 +588,9 @@ def __init__(self, config): ) self.mllm_qualcomm_max_length = None + self.lm_head_input_qdq = ActivationQDQ(bits=16) + self.lm_head_output_qdq = ActivationQDQ(bits=16) + # Initialize weights and apply final processing self.post_init() @@ -641,7 +650,10 @@ def forward( if isinstance(logits_to_keep, int) else logits_to_keep ) - logits = self.lm_head(hidden_states[:, slice_indices, :]) + logits = self.lm_head( + self.lm_head_input_qdq(hidden_states[:, slice_indices, :]) + ) + logits = self.lm_head_output_qdq(logits) loss = None if labels is not None: diff --git a/pymllm/backends/qualcomm/transformers/qwen3/runner.py b/pymllm/backends/qualcomm/transformers/qwen3/runner.py index c2aed54c8..0b4462f2b 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/runner.py +++ b/pymllm/backends/qualcomm/transformers/qwen3/runner.py @@ -65,9 +65,7 @@ def infer(self, prompt: str): # conduct text completion generated_ids = self.model.generate( **model_inputs, - max_new_tokens=self.mllm_qualcomm_max_length - - len(model_inputs.input_ids[0]) - - 1, + max_new_tokens=128 - len(model_inputs.input_ids[0]) - 1, do_sample=False, temperature=None, top_p=None, @@ -125,6 +123,9 @@ def calibrate(self, num_samples=64, max_seq_length=512): if samples_processed >= num_samples: break + if len(entry["text"].strip()) < 1024: + continue + messages = 
[{"role": "user", "content": entry["text"]}] text = self.tokenizer.apply_chat_template( messages, @@ -132,9 +133,13 @@ def calibrate(self, num_samples=64, max_seq_length=512): add_generation_prompt=True, enable_thinking=False, # Switches between thinking and non-thinking modes. Default is True. ) - model_inputs = self.tokenizer([text], return_tensors="pt").to( - self.model.device - ) + model_inputs = self.tokenizer( + [text], + return_tensors="pt", + max_length=max_seq_length, + truncation=True, + padding=False, + ).to(self.model.device) # Only need Prefill stage: directly call forward # This will trigger observer update statistics in ActivationQDQ diff --git a/pymllm/backends/qualcomm/transformers/qwen3/train.py b/pymllm/backends/qualcomm/transformers/qwen3/train.py new file mode 100644 index 000000000..81e452903 --- /dev/null +++ b/pymllm/backends/qualcomm/transformers/qwen3/train.py @@ -0,0 +1,37 @@ +import argparse +from pymllm.backends.qualcomm.transformers.qwen3.runner import Qwen3Quantizer + + +def main(): + parser = argparse.ArgumentParser(description="Qwen3 Quantizer for Qualcomm backend") + parser.add_argument( + "--model_path", + type=str, + default="Qwen3-1.7B", + help="Path to the Qwen3 model directory", + ) + parser.add_argument( + "--max_length", + type=int, + default=2048, + help="Maximum sequence length for quantization", + ) + parser.add_argument( + "--num_samples", type=int, default=128, help="Number of samples for calibration" + ) + parser.add_argument( + "--infer_text", + type=str, + default="为什么伟大不能被计划", + help="Text to run inference on", + ) + + args = parser.parse_args() + + m = Qwen3Quantizer(args.model_path, mllm_qualcomm_max_length=args.max_length) + m.calibrate(num_samples=args.num_samples, max_seq_length=args.max_length) + m.infer(args.infer_text) + + +if __name__ == "__main__": + main() diff --git a/pymllm/backends/qualcomm/transformers/static_qwen3.py b/pymllm/backends/qualcomm/transformers/static_qwen3.py deleted file mode 100644 
index 186e312ca..000000000 --- a/pymllm/backends/qualcomm/transformers/static_qwen3.py +++ /dev/null @@ -1,629 +0,0 @@ -import torch -from torch import nn -from torch.nn import functional as F -from pymllm.backends.qualcomm.transformers.core.qdq import QDQ_OP -from pymllm.backends.qualcomm.transformers.core.rms_norm import QRMSNorm -from pymllm.backends.qualcomm.transformers.core.qlinear import ( - QLinearLPBQ, - QLinearW8A16_PerChannelSym_PerTensorSym, -) -from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer - - -def generate_rope_cache( - max_length: int, - head_dim: int, - rope_theta: float, - dtype=torch.bfloat16, - device="cpu", -): - """ - Generate RoPE (Rotary Position Embedding) cache for given max_length. - - Args: - max_length: Maximum sequence length - head_dim: Dimension of each attention head - rope_theta: RoPE theta parameter (frequency base) - dtype: Data type for the embeddings - device: Device to place the embeddings on - - Returns: - tuple: (cos, sin) embeddings of shape [max_length, head_dim] - """ - inv_freq = 1.0 / ( - rope_theta - ** (torch.arange(0, head_dim, 2, dtype=torch.float32, device=device) / head_dim) - ) - t = torch.arange(max_length, dtype=torch.float32, device=device) - freqs = torch.einsum("i,j->ij", t, inv_freq) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos().to(dtype) - sin = emb.sin().to(dtype) - return cos, sin - - -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand( - batch, num_key_value_heads, n_rep, slen, head_dim - ) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`, *optional*): - Deprecated and unused. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
- """ - cos = cos.unsqueeze(unsqueeze_dim) - sin = sin.unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class Qwen3MLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = QLinearLPBQ( - self.hidden_size, - self.intermediate_size, - bias=False, - block_size=32, - ) - self.up_proj = QLinearLPBQ( - self.hidden_size, - self.intermediate_size, - bias=False, - block_size=32, - ) - self.down_proj = QLinearLPBQ( - self.intermediate_size, - self.hidden_size, - bias=False, - block_size=32, - ) - self.act_fn = nn.SiLU() - - # QDQ - self.qdq_x = QDQ_OP["A16-PerTensor"]() - self.qdq_up_result = QDQ_OP["A16-PerTensor"]() - self.qdq_gate_result = QDQ_OP["A16-PerTensor"]() - self.qdq_act = QDQ_OP["A16-PerTensor"]() - self.qdq_middle = QDQ_OP["A16-PerTensor"]() - - def freeze_observer(self): - for name, value in self.__dict__.items(): - if isinstance(value, QDQ_OP["A16-PerTensor"]) or isinstance( - value, QDQ_OP["A8-PerTensor"] - ): - value.disable_observer() - - def forward(self, x): - """ - input: - x: bf16, w/o fakequant - output: - o: bf16, w/o fakequant - """ - x = self.qdq_x(x) - up_result = self.qdq_up_result(self.up_proj(x)) - gate_result = self.qdq_gate_result(self.gate_proj(x)) - gate_result = self.qdq_act(self.act_fn(gate_result)) - o = self.qdq_middle(gate_result * up_result) - o = self.down_proj(o) - return o - - -class Qwen3Attention(nn.Module): - def __init__(self, config, layer_idx: int): - super().__init__() - self.config = config - self.layer_idx = layer_idx - self.head_dim = getattr( - config, "head_dim", config.hidden_size // config.num_attention_heads - ) - self.num_key_value_groups = ( - config.num_attention_heads // config.num_key_value_heads - ) - self.scaling = self.head_dim**-0.5 - self.q_proj = QLinearLPBQ( - 
config.hidden_size, - config.num_attention_heads * self.head_dim, - bias=False, - block_size=32, - ) - self.k_proj = QLinearLPBQ( - config.hidden_size, - config.num_key_value_heads * self.head_dim, - bias=False, - block_size=32, - ) - self.v_proj = QLinearLPBQ( - config.hidden_size, - config.num_key_value_heads * self.head_dim, - bias=False, - block_size=32, - ) - self.o_proj = QLinearLPBQ( - config.num_attention_heads * self.head_dim, - config.hidden_size, - bias=False, - block_size=32, - ) - self.q_norm = QRMSNorm(self.head_dim, eps=config.rms_norm_eps) - self.k_norm = QRMSNorm(self.head_dim, eps=config.rms_norm_eps) - - # QDQ - self.qdq_hidden_states = QDQ_OP["A16-PerTensor"]() - self.qdq_0 = QDQ_OP["A16-PerTensor"]() - self.qdq_1 = QDQ_OP["A16-PerTensor"]() - self.qdq_2 = QDQ_OP["A16-PerTensor"]() - self.qdq_3 = QDQ_OP["A16-PerTensor"]() - self.qdq_4 = QDQ_OP["A8-PerTensor"]() - self.qdq_5 = QDQ_OP["A16-PerTensor"]() - self.qdq_6 = QDQ_OP["A16-PerTensor"]() - self.qdq_7 = QDQ_OP["A16-PerTensor"]() - self.qdq_8 = QDQ_OP["A16-PerTensor"]() - self.qdq_9 = QDQ_OP["A16-PerTensor"]() - self.qdq_10 = QDQ_OP["A16-PerTensor"]() - self.qdq_11 = QDQ_OP["A16-PerTensor"]() - self.qdq_12 = QDQ_OP["A16-PerTensor"]() - self.qdq_13 = QDQ_OP["A16-PerTensor"]() - self.qdq_14 = QDQ_OP["A8-PerTensor"]() - - self.qdq_rope_0 = QDQ_OP["A16-PerTensor"]() - self.qdq_rope_1 = QDQ_OP["A16-PerTensor"]() - self.qdq_rope_2 = QDQ_OP["A16-PerTensor"]() - self.qdq_rope_3 = QDQ_OP["A16-PerTensor"]() - self.qdq_rope_4 = QDQ_OP["A16-PerTensor"]() - self.qdq_rope_5 = QDQ_OP["A16-PerTensor"]() - - self.k_cache = None - self.v_cache = None - - def freeze_observer(self): - for name, value in self.__dict__.items(): - if isinstance(value, QDQ_OP["A16-PerTensor"]) or isinstance( - value, QDQ_OP["A8-PerTensor"] - ): - value.disable_observer() - - def forward( - self, - hidden_states: torch.Tensor, - sin: torch.Tensor, - cos: torch.Tensor, - causal_mask: torch.Tensor, - ): - """ - input: - hidden_states: 
bf16, w/o fakequant - output: - o: bf16, w/o fakequant - """ - bsz, seq_len, _ = hidden_states.shape - input_shape = hidden_states.shape[:-1] - hidden_shape = (*input_shape, -1, self.head_dim) - quantized_hidden_states = self.qdq_hidden_states(hidden_states) - - # [B, H, S, D] - query_states = ( - self.q_proj(quantized_hidden_states).view(hidden_shape).transpose(1, 2) - ) - - key_states = ( - self.k_proj(quantized_hidden_states).view(hidden_shape).transpose(1, 2) - ) - value_states = ( - self.v_proj(quantized_hidden_states).view(hidden_shape).transpose(1, 2) - ) - - query_states = self.q_norm(self.qdq_0(query_states)) - query_states = self.qdq_1(query_states) - - key_states = self.k_norm(self.qdq_2(key_states)) - key_states = self.qdq_3(key_states) - - # ROPE Here - # cos = cos.unsqueeze(unsqueeze_dim) - # sin = sin.unsqueeze(unsqueeze_dim) - # q_embed = (q * cos) + (rotate_half(q) * sin) - # k_embed = (k * cos) + (rotate_half(k) * sin) - cos_embedding = cos.unsqueeze(1) - sin_embedding = sin.unsqueeze(1) - rot_q = rotate_half(query_states) - rot_k = rotate_half(key_states) - query_states = self.qdq_rope_0( - self.qdq_rope_1(query_states * cos_embedding) - + self.qdq_rope_2(rot_q * sin_embedding) - ) - key_states = self.qdq_rope_3( - self.qdq_rope_4(key_states * cos_embedding) - + self.qdq_rope_5(rot_k * sin_embedding) - ) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - key_states = self.qdq_4(key_states) - # [B, H, D, S] - key_states = key_states.transpose(2, 3) - # [B, H, S, D] - value_states = self.qdq_14(self.qdq_13(value_states)) - - # KV Cache Here - if seq_len > 1 and self.k_cache is not None and self.v_cache is not None: - self.k_cache = None - self.v_cache = None - - if seq_len == 1: - self.k_cache = torch.cat([self.k_cache, key_states], dim=-1) - self.v_cache = torch.cat([self.v_cache, value_states], dim=2) - else: - self.k_cache = key_states - self.v_cache = 
value_states - - attn = query_states @ key_states - attn = self.qdq_5(attn) - attn = attn / self.qdq_6( - torch.ones(1, dtype=torch.bfloat16, device=attn.device) * self.scaling - ) - attn = self.qdq_7(attn) - attn_min = torch.amin(attn, dim=-1, keepdim=True) - attn_min = self.qdq_8(attn_min) - attn_vv = attn_min - 20 - attn_vv = self.qdq_9(attn_vv) - attn = torch.where(causal_mask == 0, attn, attn_vv) - attn = self.qdq_10(attn) - attn = F.softmax(attn.to(torch.float32), -1).to(torch.bfloat16) - print(attn) - exit(0) - attn = self.qdq_11(attn) - y = attn @ value_states - y = self.qdq_12(y) - y = y.transpose(1, 2).reshape(bsz, seq_len, -1) - y = self.o_proj(y) - print(y.shape) - print(y) - exit(0) - return y - - -class Qwen3DecodeLayer(nn.Module): - def __init__(self, config, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - self.self_attn = Qwen3Attention(config=config, layer_idx=layer_idx) - self.mlp = Qwen3MLP(config) - self.input_layernorm = QRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = QRMSNorm( - config.hidden_size, eps=config.rms_norm_eps - ) - - self.qdq_0 = QDQ_OP["A16-PerTensor"]() - self.qdq_1 = QDQ_OP["A16-PerTensor"]() - self.qdq_2 = QDQ_OP["A16-PerTensor"]() - self.qdq_3 = QDQ_OP["A16-PerTensor"]() - - def freeze_observer(self): - self.mlp.freeze_observer() - self.self_attn.freeze_observer() - self.input_layernorm.freeze_observer() - self.post_attention_layernorm.freeze_observer() - for name, value in self.__dict__.items(): - if isinstance(value, QDQ_OP["A16-PerTensor"]) or isinstance( - value, QDQ_OP["A8-PerTensor"] - ): - value.disable_observer() - - def forward( - self, - hidden_states: torch.Tensor, - sin: torch.Tensor, - cos: torch.Tensor, - causal_mask: torch.Tensor, - ): - """ - inputs: - hidden_states: bf16, w/o fakequant - outputs: - hidden_states: bf16, w/o fakequant - """ - hidden_states = self.qdq_0(hidden_states) - residual = hidden_states - hidden_states = 
self.input_layernorm(hidden_states) - # Self Attention - hidden_states = self.self_attn( - hidden_states, - sin, - cos, - causal_mask, - ) - hidden_states = self.qdq_2(residual + self.qdq_1(hidden_states)) - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + self.qdq_3(hidden_states) - return hidden_states - - -class Qwen3Model(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - self.embed_tokens = nn.Embedding( - config.vocab_size, config.hidden_size, self.padding_idx - ) - self.layers = nn.ModuleList( - [ - Qwen3DecodeLayer(config, layer_idx) - for layer_idx in range(config.num_hidden_layers) - ] - ) - self.norm = QRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.qdq_0 = QDQ_OP["A16-PerTensor"]() - - def freeze_observer(self): - self.norm.freeze_observer() - for item in self.layers: - item.freeze_observer() - for name, value in self.__dict__.items(): - if isinstance(value, QDQ_OP["A16-PerTensor"]) or isinstance( - value, QDQ_OP["A8-PerTensor"] - ): - value.disable_observer() - - def forward(self, input_ids, sin, cos, causal_mask): - inputs_embeds = self.embed_tokens(input_ids).to(torch.bfloat16) - hidden_states = inputs_embeds - - for decoder_layer in self.layers[: self.config.num_hidden_layers]: - hidden_states = decoder_layer(hidden_states, sin, cos, causal_mask) - - hidden_states = self.norm(self.qdq_0(hidden_states)) - return hidden_states - - -class Qwen3ForCausalLM(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.model = Qwen3Model(config) - self.vocab_size = config.vocab_size - self.lm_head = QLinearW8A16_PerChannelSym_PerTensorSym( - config.hidden_size, config.vocab_size, bias=False - ) - self.qdq_0 = QDQ_OP["A16-PerTensor"]() - self.qdq_1 = 
QDQ_OP["A16-PerTensor"]() - self.qdq_2 = QDQ_OP["A16-PerTensor"]() - - # Register sin and cos as buffers - self.register_buffer("sin", None) - self.register_buffer("cos", None) - - def freeze_observer(self): - self.model.freeze_observer() - for name, value in self.__dict__.items(): - if isinstance(value, QDQ_OP["A16-PerTensor"]) or isinstance( - value, QDQ_OP["A8-PerTensor"] - ): - value.disable_observer() - - def disable_fakequant(self): - # self.model.disable_fakequant() - for name, value in self.__dict__.items(): - if isinstance(value, QDQ_OP["A16-PerTensor"]) or isinstance( - value, QDQ_OP["A8-PerTensor"] - ): - value.disable_fakequant() - - def forward( - self, - input_ids, - position_ids, - max_length, - ): - _, seq_len = input_ids.shape - - # Generate causal mask based on position_ids length - # For prefill, we need a lower triangular mask - if seq_len != 1: - causal_mask = 1 - torch.tril( - torch.ones(seq_len, seq_len, dtype=torch.int8, device=input_ids.device) - ) - # [1, 1, seq_len, seq_len] - causal_mask = causal_mask.unsqueeze(0).unsqueeze(0) - else: - # [1, 1, seq_len, seq_len] - causal_mask = torch.zeros( - (1, 1, 1, seq_len), dtype=torch.int8, device=input_ids.device - ) - - # Generate or use registered RoPE embeddings - if self.sin is None or self.cos is None or self.cos.shape[0] < max_length: - cos, sin = generate_rope_cache( - max_length, - head_dim=self.config.head_dim, - rope_theta=self.config.rope_theta, - dtype=torch.bfloat16, - device=input_ids.device, - ) - # Register the generated embeddings - self.sin = self.qdq_1(sin) - self.cos = self.qdq_2(cos) - - # Slice RoPE embeddings to current sequence length - cos = self.cos[position_ids] - sin = self.sin[position_ids] - - out = self.model(input_ids, sin, cos, causal_mask) - logits = self.lm_head(self.qdq_0(out)) - return logits - - -class Qwen3Quantizer: - def __init__(self): - # Other stuff - self.tokenizer: AutoTokenizer = None - self.model: Qwen3ForCausalLM = None - self.config: AutoConfig = 
None - - def load_from_hf(self, model_path: str, verbose: bool = False): - self.config = AutoConfig.from_pretrained(model_path) - state_dict = AutoModelForCausalLM.from_pretrained(model_path).state_dict() - self.model = Qwen3ForCausalLM(self.config) - - # Check if all original weight is in state_dict - model_keys = set(self.model.state_dict().keys()) - loaded_keys = set(state_dict.keys()) - - # 1. Keys present in model but missing in state_dict - missing_keys = model_keys - loaded_keys - if missing_keys and verbose: - print( - f"\n⚠️ Keys present in model but missing in state_dict ({len(missing_keys)} keys):" - ) - for k in sorted(missing_keys): - print(f" - {k}") - - # 2. Keys present in state_dict but unexpected in model - unexpected_keys = loaded_keys - model_keys - if unexpected_keys: - print( - f"\n⚠️ Keys present in state_dict but unexpected in model ({len(unexpected_keys)} keys):" - ) - for k in sorted(unexpected_keys): - print(f" - {k}") - - self.model.load_state_dict(state_dict, strict=False) - self.model.cuda() - self.tokenizer = AutoTokenizer.from_pretrained(model_path) - - def infer(self, prompt: str, enable_fake_quant: bool = True) -> str: - """ - Generate response for the given prompt. 
- - Args: - prompt: Input text prompt - - Returns: - Generated text response - """ - # Tokenize the input prompt - self.model.freeze_observer() - if not enable_fake_quant: - self.model.disable_fakequant() - if hasattr(self.tokenizer, "chat_template") and self.tokenizer.chat_template: - formatted_prompt = self.tokenizer.apply_chat_template( - [{"role": "user", "content": prompt}], - tokenize=False, - add_generation_prompt=True, - enable_thinking=False, - ) - else: - formatted_prompt = prompt - inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to("cuda") - input_ids = inputs["input_ids"] - seq_len = input_ids.shape[1] - - # Initialize position_ids - position_ids = torch.arange(seq_len, dtype=torch.long, device=input_ids.device) - position_ids = position_ids.unsqueeze(0) # Add batch dimension - - # Get max_length from config or use a default value - max_length = getattr(self.config, "max_position_embeddings", 2048) - - # TODO remove this - max_length = 8 - - # Prefill stage: process the prompt and build KV cache - with torch.no_grad(): - logits = self.model( - input_ids=input_ids, - position_ids=position_ids, - max_length=2048, - ) - - # Get the last token from prefill as the first generated token - next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True) - generated_tokens = next_token.clone() - - # Decode stage: generate tokens one by one using KV cache - while generated_tokens.shape[1] < max_length: - # Update position_ids for the new token - new_position_id = position_ids[:, -1] + 1 - position_ids = new_position_id.unsqueeze(0) - - with torch.no_grad(): - logits = self.model( - input_ids=next_token, - position_ids=position_ids, - max_length=max_length, - ) - - # Get next token (greedy decoding) - next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True) - - # Append generated token - generated_tokens = torch.cat([generated_tokens, next_token], dim=1) - - # Stop if EOS token is generated - if next_token.item() == self.tokenizer.eos_token_id: - break 
- - # Decode generated tokens to text - generated_text = self.tokenizer.decode( - generated_tokens[0], skip_special_tokens=True - ) - - return generated_text - - def calibrate(self, dataset_path: str): - """ - Calibrate Only on PREFILL stage !!! - """ - # Call infer after calibrate done. - pass - - -if __name__ == "__main__": - quantizer = Qwen3Quantizer() - quantizer.load_from_hf("/mnt/user-ssd/shared_models/Qwen3-1.7B/") - result = quantizer.infer("hello") - print(result) diff --git a/pymllm/backends/qualcomm/transformers/train.py b/pymllm/backends/qualcomm/transformers/train.py deleted file mode 100644 index a36416a44..000000000 --- a/pymllm/backends/qualcomm/transformers/train.py +++ /dev/null @@ -1,6 +0,0 @@ -from pymllm.backends.qualcomm.transformers.qwen3.runner import Qwen3Quantizer - -if __name__ == "__main__": - m = Qwen3Quantizer() - m.calibrate() - m.infer("简述中国断代史") diff --git a/requirements-qnn-aot.txt b/requirements-qnn-aot.txt index f3f435c9f..813462fee 100644 --- a/requirements-qnn-aot.txt +++ b/requirements-qnn-aot.txt @@ -1 +1,4 @@ addict==2.4.0 +modelscope==1.33.0 +datasets==2.21.0 +transformers==4.57.3 From a99e5c6b7e0891cff4bd1ecceca152702326b3ec Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Sat, 3 Jan 2026 06:58:18 +0000 Subject: [PATCH 06/13] fix: qnn aot, qwen3 silu int16. 
--- .../qualcomm/transformers/qwen3/modeling_qwen3.py | 10 ++++++++-- pymllm/backends/qualcomm/transformers/qwen3/runner.py | 4 +++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py b/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py index 1fe04f14d..f06019f2a 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py +++ b/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py @@ -17,6 +17,7 @@ import torch from torch import nn +from torch.nn import functional as F from transformers.activations import ACT2FN from transformers.cache_utils import Cache, DynamicCache @@ -68,7 +69,6 @@ def __init__(self, config): self.down_proj = QLinearLPBQ( self.intermediate_size, self.hidden_size, bias=False, block_size=32 ) - self.act_fn = ACT2FN[config.hidden_act] # QDQ self.up_proj_input_qdq = ActivationQDQ(bits=16) @@ -76,12 +76,18 @@ def __init__(self, config): self.gate_proj_output_qdq = ActivationQDQ(bits=16) self.act_output_qdq = ActivationQDQ(bits=16) self.down_proj_input_qdq = ActivationQDQ(bits=16) + self.sigmoid_output_qdq = ActivationQDQ(bits=16) def forward(self, x): x = self.up_proj_input_qdq(x) up_result = self.up_proj_output_qdq(self.up_proj(x)) gate_result = self.gate_proj_output_qdq(self.gate_proj(x)) - gate_result = self.act_output_qdq(self.act_fn(gate_result)) + + # SiLU + gate_result = self.act_output_qdq( + gate_result * self.sigmoid_output_qdq(F.sigmoid(gate_result)) + ) + o = self.down_proj_input_qdq(gate_result * up_result) o = self.down_proj(o) return o diff --git a/pymllm/backends/qualcomm/transformers/qwen3/runner.py b/pymllm/backends/qualcomm/transformers/qwen3/runner.py index 0b4462f2b..082a6f0bf 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/runner.py +++ b/pymllm/backends/qualcomm/transformers/qwen3/runner.py @@ -65,7 +65,9 @@ def infer(self, prompt: str): # conduct text completion generated_ids = self.model.generate( 
**model_inputs, - max_new_tokens=128 - len(model_inputs.input_ids[0]) - 1, + max_new_tokens=self.mllm_qualcomm_max_length + - len(model_inputs.input_ids[0]) + - 1, do_sample=False, temperature=None, top_p=None, From 87746230f6a6dc099f74725bbc0b79ab3ffbb068 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Sat, 3 Jan 2026 15:21:42 +0000 Subject: [PATCH 07/13] fix: Save Qnn Qwen3 AOT Model. --- .../backends/qualcomm/transformers/qwen3/runner.py | 7 +++++++ pymllm/backends/qualcomm/transformers/qwen3/train.py | 12 ++++++++++++ 2 files changed, 19 insertions(+) diff --git a/pymllm/backends/qualcomm/transformers/qwen3/runner.py b/pymllm/backends/qualcomm/transformers/qwen3/runner.py index 082a6f0bf..37f8bae16 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/runner.py +++ b/pymllm/backends/qualcomm/transformers/qwen3/runner.py @@ -52,6 +52,13 @@ def freeze_activation(self): def enable_activation_update(self): self.model.apply(enable_qdq_observer) + def compile(self): + print("Compile Start.") + self.model = torch.compile( + self.model, mode="reduce-overhead", fullgraph=False, backend="inductor" + ) + print("Compile done.") + def infer(self, prompt: str): messages = [{"role": "user", "content": prompt}] text = self.tokenizer.apply_chat_template( diff --git a/pymllm/backends/qualcomm/transformers/qwen3/train.py b/pymllm/backends/qualcomm/transformers/qwen3/train.py index 81e452903..746970020 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/train.py +++ b/pymllm/backends/qualcomm/transformers/qwen3/train.py @@ -1,4 +1,6 @@ +import os import argparse +from safetensors.torch import save_model from pymllm.backends.qualcomm.transformers.qwen3.runner import Qwen3Quantizer @@ -25,13 +27,23 @@ def main(): default="为什么伟大不能被计划", help="Text to run inference on", ) + parser.add_argument( + "--output_dir", + type=str, + help="Directory to save the quantized model", + ) args = parser.parse_args() m = Qwen3Quantizer(args.model_path, 
mllm_qualcomm_max_length=args.max_length) m.calibrate(num_samples=args.num_samples, max_seq_length=args.max_length) + # m.compile() m.infer(args.infer_text) + os.makedirs(args.output_dir, exist_ok=True) + model_save_path = os.path.join(args.output_dir, "model.safetensors") + save_model(m.model, model_save_path) + if __name__ == "__main__": main() From 00dd1b6a66bb53da3a8c1c8fe664cf57220261b9 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Sun, 4 Jan 2026 16:04:14 +0000 Subject: [PATCH 08/13] fix: PTQ pass in qualcomm AOT workflow. --- examples/qwen3_qnn_aot/compile.cpp | 16 +- .../qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp | 205 +- examples/qwen3_qnn_aot/qwen3_qnn_aot.mir | 3500 +++++++++-------- mllm/backends/cpu/CPUBackend.cpp | 11 +- mllm/backends/cpu/kernels/Kernels.hpp | 2 + mllm/backends/cpu/kernels/arm/sigmoid.cpp | 131 + mllm/backends/cpu/kernels/arm/sigmoid.hpp | 18 + mllm/backends/cpu/kernels/x86/sigmoid.cpp | 47 + mllm/backends/cpu/kernels/x86/sigmoid.hpp | 16 + mllm/backends/cpu/ops/LinearOp.cpp | 2 +- mllm/backends/cpu/ops/SigmoidOp.cpp | 41 + mllm/backends/cpu/ops/SigmoidOp.hpp | 25 + .../qnn/aot/passes/AOTCompileContext.cpp | 4 + .../qnn/aot/passes/AOTCompileContext.hpp | 6 + mllm/backends/qnn/aot/passes/AOTPipeline.cpp | 8 +- .../qnn/aot/passes/LLMQuantRecipePass.cpp | 106 +- .../qnn/aot/passes/LLMQuantRecipePass.hpp | 38 + mllm/backends/qnn/aot/passes/PTQPass.cpp | 29 + mllm/backends/qnn/aot/passes/PTQPass.hpp | 32 + mllm/compile/ir/GeneratedRTTIKind.hpp | 3 +- mllm/compile/ir/NodeRTTIClassOfImpl.hpp | 5 +- mllm/compile/ir/linalg/Attribute.hpp | 1 + mllm/compile/ir/linalg/Op.cpp | 1 + mllm/compile/ir/linalg/Op.hpp | 2 + mllm/compile/ir/rtti_kind_gen.py | 1 + mllm/core/DataTypes.cpp | 11 +- mllm/core/OpTypes.hpp | 2 + mllm/core/Tensor.cpp | 52 +- mllm/core/Tensor.hpp | 11 +- mllm/core/TensorViewImpl.hpp | 3 + mllm/core/aops/ElewiseOps.cpp | 2 +- mllm/core/aops/ParamOp.cpp | 12 +- mllm/core/aops/SigmoidOp.cpp | 37 + 
mllm/core/aops/SigmoidOp.hpp | 33 + mllm/nn/Functional.cpp | 6 + mllm/nn/Functional.hpp | 2 + mllm/nn/Module.cpp | 9 +- mllm/nn/Module.hpp | 6 +- .../qualcomm/transformers/core/qdq.py | 56 +- .../transformers/qwen3/modeling_qwen3.py | 23 +- .../qualcomm/transformers/qwen3/train.py | 4 + pymllm/quantize/pipeline.py | 6 + pymllm/utils/mllm_convertor.py | 18 + 43 files changed, 2658 insertions(+), 1885 deletions(-) create mode 100644 mllm/backends/cpu/kernels/arm/sigmoid.cpp create mode 100644 mllm/backends/cpu/kernels/arm/sigmoid.hpp create mode 100644 mllm/backends/cpu/kernels/x86/sigmoid.cpp create mode 100644 mllm/backends/cpu/kernels/x86/sigmoid.hpp create mode 100644 mllm/backends/cpu/ops/SigmoidOp.cpp create mode 100644 mllm/backends/cpu/ops/SigmoidOp.hpp create mode 100644 mllm/core/aops/SigmoidOp.cpp create mode 100644 mllm/core/aops/SigmoidOp.hpp diff --git a/examples/qwen3_qnn_aot/compile.cpp b/examples/qwen3_qnn_aot/compile.cpp index 64bf41194..26f10be05 100644 --- a/examples/qwen3_qnn_aot/compile.cpp +++ b/examples/qwen3_qnn_aot/compile.cpp @@ -41,14 +41,12 @@ MLLM_MAIN({ // Gen sin and cos { auto inv = mllm::models::qwen3::makeRoPEInvFreq(model_cfg.head_dim, model_cfg.rope_theta); - auto position_ids = mllm::Tensor::empty({1, CL}, mllm::kInt64, mllm::kCPU).alloc(); - auto position_ids_ptr = position_ids.ptr(); - for (int b = 0; b < 1; ++b) { - for (int s = 0; s < CL; ++s) { position_ids_ptr[b * CL + s] = s; } - } + auto position_ids = mllm::Tensor::empty({CL}, mllm::kInt32, mllm::kCPU).alloc(); + auto position_ids_ptr = position_ids.ptr(); + for (int s = 0; s < CL; ++s) { position_ids_ptr[s] = s; } auto [rope_sin, rope_cos] = mllm::models::qwen3::makeRotaryPosEmbedding(position_ids, inv, 1.f); - params->push("rope_sin", rope_sin.to(mllm::kInt16PerTensorSym)); - params->push("rope_cos", rope_cos.to(mllm::kInt16PerTensorSym)); + params->push("rope_sin", rope_sin.to(mllm::kUInt16PerTensorSym).setMemType(mllm::kParamsNormal).setName("rope_sin")); + 
params->push("rope_cos", rope_cos.to(mllm::kUInt16PerTensorSym).setMemType(mllm::kParamsNormal).setName("rope_cos")); } model.load(params); @@ -56,7 +54,7 @@ MLLM_MAIN({ // past_key_i: [B, H, D, CL-N] for each layer i // past_value_i: [B, H, CL-N, D] for each layer i // causal_mask: [B, 1, N, CL] - auto sequence = mllm::Tensor::zeros({1, N}, mllm::kInt64); + auto sequence = mllm::Tensor::zeros({1, N}, mllm::kInt32); auto causal_mask = mllm::Tensor::zeros({1, 1, N, CL}, mllm::kUInt16); // Create KV cache inputs for all layers @@ -75,7 +73,7 @@ MLLM_MAIN({ model_cfg.head_dim, CL - N, }, mllm::kInt8PerTensorSym); - trace_inputs[past_value_name] = mllm::Tensor::empty({1, model_cfg.num_key_value_heads, CL - N, model_cfg.head_dim}, mllm::kInt8PerTensorSym); + trace_inputs[past_value_name] = mllm::Tensor::empty({1, model_cfg.num_key_value_heads, CL - N, model_cfg.head_dim}, mllm::kUInt8PerTensorSym); // clang-format on } diff --git a/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp b/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp index e78e34c60..1f0da38e7 100644 --- a/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp +++ b/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp @@ -4,16 +4,92 @@ #pragma once #include "mllm/mllm.hpp" -#include "mllm/nn/Module.hpp" #include "mllm/nn/Nn.hpp" +#include "mllm/nn/Module.hpp" #include "mllm/nn/Functional.hpp" -#include "mllm/models/qwen3/configuration_qwen3.hpp" +#include "mllm/core/DataTypes.hpp" #include "mllm/utils/Enumerate.hpp" -#include "mllm/models/ARGeneration.hpp" #include "mllm/compile/ir/Trace.hpp" +#include "mllm/models/ARGeneration.hpp" +#include "mllm/models/qwen3/configuration_qwen3.hpp" namespace mllm::models::qwen3 { +Tensor rotateHalf(Tensor x) { // NOLINT + // X is [x, x, x, D] + auto D = x.size(-1); + auto x1 = x[{kAll, kAll, kAll, {kAll, D / 2}}]; + auto x2 = x[{kAll, kAll, kAll, {D / 2, kAll}}]; + return nn::functional::concat({-x2, x1}, -1); +} + +namespace ptq { + +Tensor QDQ(nn::Module* m, Tensor in, const 
std::string& qdq_name_in_pytorch) { + std::string scale_name = m->getModuleName() + "." + qdq_name_in_pytorch + ".fake_quant.scale"; + std::string zp_name = m->getModuleName() + "." + qdq_name_in_pytorch + ".fake_quant.zero_point"; + + if (m->getModuleName().empty()) { + scale_name = qdq_name_in_pytorch + ".fake_quant.scale"; + zp_name = qdq_name_in_pytorch + ".fake_quant.zero_point"; + } else { + scale_name = m->getModuleName() + "." + qdq_name_in_pytorch + ".fake_quant.scale"; + zp_name = m->getModuleName() + "." + qdq_name_in_pytorch + ".fake_quant.zero_point"; + } + + switch (in.dtype()) { + case kUInt16PerTensorAsy: { + auto scale = m->getTopParameterFile()->pull(scale_name); + auto zp = m->getTopParameterFile()->pull(zp_name); + in.attach("scale", scale.impl()); + in.attach("zero_point", zp.impl()); + break; + } + // For Constant! + case kFloat32: { + MLLM_RT_ASSERT_EQ(in.rank(), 1); + MLLM_RT_ASSERT_EQ(in.size(-1), 1); + auto scale = m->getTopParameterFile()->pull(scale_name); + auto zp = m->getTopParameterFile()->pull(zp_name); + in.attach("scale", scale.impl()); + in.attach("zero_point", zp.impl()); + break; + } + default: { + MLLM_ERROR_EXIT(ExitCode::kCoreError, "Can't Process dtype={}", nameOfType(in.dtype())); + } + } + + return in; +} + +Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch) { + auto scale_name = m->getModuleName() + "." + qdq_name_in_pytorch + ".fake_quant.scale"; + auto zp_name = m->getModuleName() + "." + qdq_name_in_pytorch + ".fake_quant.zero_point"; + + // The inputs is int8 sym. which means zero_point should be changed. + switch (in.dtype()) { + case kUInt8PerTensorSym: { + auto scale = m->getTopParameterFile()->pull(scale_name); + auto zp = m->getTopParameterFile()->pull(zp_name); + MLLM_RT_ASSERT_EQ(zp.item(), 0); + + // Is 128! not 127! 
+ auto new_zp = Tensor::constant(128, kInt32).setName(zp_name).setMemType(kParamsNormal); + in.attach("scale", scale.impl()); + in.attach("zero_point", new_zp.impl()); + break; + } + default: { + MLLM_ERROR_EXIT(ExitCode::kCoreError, "Can't Process dtype={}", nameOfType(in.dtype())); + } + } + + return in; +} + +} // namespace ptq + inline auto makeRoPEInvFreq(int output_dim, float rope_theta) -> Tensor { auto inv_freq = Tensor::empty({output_dim / 2}, kFloat32, kCPU).alloc(); auto inv_freq_ptr = inv_freq.ptr(); @@ -86,12 +162,19 @@ class Qwen3MLP final : public nn::Module { } std::vector forward(const std::vector& inputs, const std::vector& args) override { - auto x = gate_proj_(inputs[0]); - x = silu_(x); - auto y = up_proj_(inputs[0]); - x = x * y; - x = down_proj_(x); - return {x}; + auto x = inputs[0]; + x = ptq::QDQ(this, x, "up_proj_input_qdq"); + auto up_result = ptq::QDQ(this, up_proj_(x), "up_proj_output_qdq"); + auto gate_result = ptq::QDQ(this, gate_proj_(x), "gate_proj_output_qdq"); + + // SiLU + gate_result = ptq::QDQ(this, (gate_result * ptq::QDQ(this, nn::functional::sigmoid(gate_result), "sigmoid_output_qdq")), + "act_output_qdq"); + + auto o = ptq::QDQ(this, gate_result * up_result, "down_proj_input_qdq"); + o = down_proj_(o); + + return {o}; } }; @@ -102,8 +185,6 @@ class Qwen3Attention final : public nn::Module { nn::Linear o_proj_; nn::RMSNorm rms_norm_q_; nn::RMSNorm rms_norm_k_; - nn::RoPE q_rope_; - nn::RoPE k_rope_; nn::CausalMask mask_; nn::Softmax softmax_; @@ -135,25 +216,24 @@ class Qwen3Attention final : public nn::Module { rms_norm_q_ = reg("q_norm", cfg.rms_norm_eps); rms_norm_k_ = reg("k_norm", cfg.rms_norm_eps); - q_rope_ = reg("q_rope", cfg.rope_theta, cfg.max_position_embeddings); - k_rope_ = reg("k_rope", cfg.rope_theta, cfg.max_position_embeddings); - mask_ = reg("mask"); softmax_ = reg("softmax", -1); } std::vector forward(const std::vector& inputs, const std::vector& args) override { - auto x = inputs[0]; + auto hidden_states 
= inputs[0]; auto llm_embedding_sin = inputs[1]; auto llm_embedding_cos = inputs[2]; auto causal_mask = inputs[3]; auto past_key = inputs[4]; auto past_value = inputs[5]; + hidden_states = ptq::QDQ(this, hidden_states, "q_proj_input_qdq"); + // [B, S, H * D] - auto query_states = q_proj_(x); - auto key_states = k_proj_(x); - auto value_states = v_proj_(x); + auto query_states = q_proj_(hidden_states); + auto key_states = k_proj_(hidden_states); + auto value_states = v_proj_(hidden_states); // [B, H, S, D] query_states = query_states.view({1, -1, num_attention_heads_, head_dim_}).transpose(1, 2); @@ -161,23 +241,38 @@ class Qwen3Attention final : public nn::Module { value_states = value_states.view({1, -1, num_key_value_heads_, head_dim_}).transpose(1, 2); // [B, H, S, D] - query_states = rms_norm_q_(query_states); - key_states = rms_norm_k_(key_states); + query_states = rms_norm_q_(ptq::QDQ(this, query_states, "q_norm_input_qdq")); + key_states = rms_norm_k_(ptq::QDQ(this, key_states, "k_norm_input_qdq")); + + query_states = ptq::QDQ(this, query_states, "q_norm_output_qdq"); + key_states = ptq::QDQ(this, key_states, "k_norm_output_qdq"); // [B, H, S, D] - query_states = q_rope_(query_states, llm_embedding_sin, llm_embedding_cos); - key_states = k_rope_(key_states, llm_embedding_sin, llm_embedding_cos); + auto cos = llm_embedding_cos.unsqueeze(1); + auto sin = llm_embedding_sin.unsqueeze(1); + query_states = ptq::QDQ(this, + ptq::QDQ(this, query_states * cos, "q_rope_mul_0_output_qdq") + + ptq::QDQ(this, rotateHalf(query_states) * sin, "q_rope_mul_1_output_qdq"), + "q_rope_add_0_output_qdq"); + key_states = ptq::QDQ(this, + ptq::QDQ(this, key_states * cos, "k_rope_mul_0_output_qdq") + + ptq::QDQ(this, rotateHalf(key_states) * sin, "k_rope_mul_1_output_qdq"), + "k_rope_add_0_output_qdq"); // De-quantization and quantization again key_states = key_states.to(kFloat16); - key_states = key_states.to(kInt8PerTensorSym); + key_states = key_states.to(kUInt8PerTensorSym); + 
key_states = ptq::QDQ_KV(this, key_states, "k_cast_to_int8_qdq"); // [B, H, D, S] key_states = key_states.transpose(2, 3); // Handle KV Cache + value_states = ptq::QDQ(this, value_states, "v_cast_to_int16_qdq"); value_states = value_states.to(kFloat16); - value_states = value_states.to(kInt8PerTensorSym); + value_states = value_states.to(kUInt8PerTensorSym); + value_states = ptq::QDQ_KV(this, value_states, "v_cast_to_int8_qdq"); + auto kh = nn::functional::concat({past_key, key_states}, -1); // [B, H, D, S] auto vh = nn::functional::concat({past_value, value_states}, 2); // [B, H, S, D] @@ -186,15 +281,18 @@ class Qwen3Attention final : public nn::Module { vh = vh.repeat(num_key_value_groups_, 1); // Attn - auto attn = nn::functional::matmul(query_states, kh); - attn = attn.mul(scale_, kFloat32); + auto attn = ptq::QDQ(this, nn::functional::matmul(query_states, kh), "qk_matmul_output_qdq"); + auto scale = Tensor::constant(scale_, kFloat32); + scale = ptq::QDQ(this, scale, "scaling_qdq"); + attn = ptq::QDQ(this, attn.mulConstant(scale), "mul_0_output_qdq"); // Masked Softmax - auto attn_min = attn.min(-1, true); - float minus_value = -20; - attn = nn::functional::where(causal_mask.equal(0.f), attn, attn_min.add(minus_value, kInt16)); - attn = nn::functional::softmax(attn, -1); - auto y = nn::functional::matmul(attn, vh); + auto attn_min = ptq::QDQ(this, attn.min(-1, true), "reduce_min_output_qdq"); + auto minus_value = Tensor::constant(-20, kFloat32); + minus_value = ptq::QDQ(this, minus_value, "neg_20_qdq"); + attn = nn::functional::where(causal_mask.equal(0.f), attn, attn_min.addConstant(minus_value)); + attn = ptq::QDQ(this, nn::functional::softmax(attn, -1), "softmax_output_qdq"); + auto y = ptq::QDQ(this, nn::functional::matmul(attn, vh), "attn_value_matmul_output_qdq"); y = y.transpose(1, 2).view({1, -1, num_attention_heads_ * head_dim_}); y = o_proj_(y); @@ -227,14 +325,18 @@ class Qwen3Decoder final : public nn::Module { auto past_key = inputs[4]; auto 
past_value = inputs[5]; - auto x = input_layer_norm_(inputs[0]); - auto _ = self_attn_(x, llm_embedding_sin, llm_embedding_cos, causal_mask, past_key, past_value); - x = _[0]; - auto tmp = x + inputs[0]; - x = post_attention_layer_norm_(tmp); - x = mlp_(x)[0]; - x = x + tmp; - return {x, _[1], _[2]}; + auto hidden_states = inputs[0]; + hidden_states = ptq::QDQ(this, hidden_states, "input_layernorm_input_qdq"); + auto residual = hidden_states; + hidden_states = input_layer_norm_(hidden_states); + auto _ = self_attn_(hidden_states, llm_embedding_sin, llm_embedding_cos, causal_mask, past_key, past_value); + hidden_states = _[0]; + hidden_states = ptq::QDQ(this, residual + ptq::QDQ(this, hidden_states, "add_0_lhs_input_qdq"), "add_0_output_qdq"); + residual = hidden_states; + hidden_states = post_attention_layer_norm_(hidden_states); + hidden_states = mlp_(hidden_states)[0]; + hidden_states = residual + ptq::QDQ(this, hidden_states, "add_1_lhs_input_qdq"); + return {hidden_states, _[1], _[2]}; } }; @@ -266,13 +368,13 @@ class Qwen3Text final : public nn::Module { auto x = embedding_(inputs[0]); // Quantization - x = x.to(kInt16PerTensorSym); + x = x.to(kUInt16PerTensorAsy); auto position_ids = inputs[1]; auto causal_mask = inputs[2]; position_ids = position_ids.squeeze(0); - auto llm_embedding_sin = rope_sin_.weight()[{{0}, position_ids, {kAll}}]; - auto llm_embedding_cos = rope_cos_.weight()[{{0}, position_ids, {kAll}}]; + auto llm_embedding_sin = rope_sin_()[{{0}, position_ids, {kAll}}]; + auto llm_embedding_cos = rope_cos_()[{{0}, position_ids, {kAll}}]; std::vector keys; std::vector values; @@ -285,7 +387,7 @@ class Qwen3Text final : public nn::Module { values.push_back(_[2]); } - x = norm_(x); + x = norm_(ptq::QDQ(this, x, "norm_input_qdq")); auto ret = std::vector{x}; for (const auto& item : keys) { ret.push_back(item); } @@ -357,17 +459,15 @@ class Qwen3ForCausalLM : public ARGeneration, public nn::Module { // For decode phase, increment the last position if 
(seq_len == 1) { - auto last_pos = *position_ids.offsettedPtr({0, position_ids.shape()[1] - 1}); - position_ids = Tensor::empty({batch_size, 1}, kInt64, kCPU).alloc(); - *position_ids.offsettedPtr({0, 0}) = last_pos + 1; + auto last_pos = *position_ids.offsettedPtr({0, position_ids.shape()[1] - 1}); + position_ids = Tensor::empty({batch_size, 1}, kInt32, kCPU).alloc(); + *position_ids.offsettedPtr({0, 0}) = last_pos + 1; } } else { // Generate position_ids for prefill phase - position_ids = Tensor::empty({batch_size, seq_len}, kInt64, kCPU).alloc(); - auto position_ids_ptr = position_ids.ptr(); - for (int b = 0; b < batch_size; ++b) { - for (int s = 0; s < seq_len; ++s) { position_ids_ptr[b * seq_len + s] = s; } - } + position_ids = Tensor::empty({batch_size, seq_len}, kInt32, kCPU).alloc(); + auto position_ids_ptr = position_ids.ptr(); + for (int s = 0; s < seq_len; ++s) { position_ids_ptr[s] = s; } } ir::lowlevel::traceStart(); @@ -377,7 +477,8 @@ class Qwen3ForCausalLM : public ARGeneration, public nn::Module { llm_inputs.insert(llm_inputs.end(), kv_caches.begin(), kv_caches.end()); sequence = llm(llm_inputs)[0]; - sequence = lm_head_(sequence); + sequence = lm_head_(ptq::QDQ(this, sequence, "lm_head_input_qdq")); + ptq::QDQ(this, sequence, "lm_head_output_qdq"); ir::lowlevel::traceComment(" ╔═════╗ "); ir::lowlevel::traceComment(" ║ o o ║ "); ir::lowlevel::traceComment(" ║ ▽ ║ "); diff --git a/examples/qwen3_qnn_aot/qwen3_qnn_aot.mir b/examples/qwen3_qnn_aot/qwen3_qnn_aot.mir index 6ca20f7af..1caff3b4a 100644 --- a/examples/qwen3_qnn_aot/qwen3_qnn_aot.mir +++ b/examples/qwen3_qnn_aot/qwen3_qnn_aot.mir @@ -1,317 +1,319 @@ @main () -> () { graph.SubGraphOp @init [symbol:init] { () -> () { - tensor.CPU.register () -> (%105:tensor<[151936, 2048], Float32, CPU>[@model.embed_tokens.weight][symbol:model.embed_tokens.weight])[symbol:model.embed_tokens.weight] - tensor.CPU.register () -> (%199:tensor<[2048], Float32, 
CPU>[@model.layers.0.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=66), symbol:model.layers.0.input_layernorm.weight])[symbol:model.layers.0.input_layernorm.weight] - tensor.CPU.register () -> (%76:tensor<[2048, 2048], Float32, CPU>[@model.layers.0.self_attn.q_proj.weight][symbol:model.layers.0.self_attn.q_proj.weight])[symbol:model.layers.0.self_attn.q_proj.weight] - tensor.CPU.register () -> (%133:tensor<[1024, 2048], Float32, CPU>[@model.layers.0.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=67), symbol:model.layers.0.self_attn.k_proj.weight])[symbol:model.layers.0.self_attn.k_proj.weight] - tensor.CPU.register () -> (%179:tensor<[1024, 2048], Float32, CPU>[@model.layers.0.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=69), symbol:model.layers.0.self_attn.v_proj.weight])[symbol:model.layers.0.self_attn.v_proj.weight] - tensor.CPU.register () -> (%200:tensor<[128], Float32, CPU>[@model.layers.0.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=73), symbol:model.layers.0.self_attn.q_norm.weight])[symbol:model.layers.0.self_attn.q_norm.weight] - tensor.CPU.register () -> (%291:tensor<[128], Float32, CPU>[@model.layers.0.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=75), symbol:model.layers.0.self_attn.k_norm.weight])[symbol:model.layers.0.self_attn.k_norm.weight] - tensor.CPU.register () -> (%269:tensor<[2048, 2048], Float32, CPU>[@model.layers.0.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=88), symbol:model.layers.0.self_attn.o_proj.weight])[symbol:model.layers.0.self_attn.o_proj.weight] - tensor.CPU.register () -> (%40:tensor<[2048], Float32, CPU>[@model.layers.0.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=91), symbol:model.layers.0.post_attention_layernorm.weight])[symbol:model.layers.0.post_attention_layernorm.weight] - tensor.CPU.register () -> (%9:tensor<[6144, 2048], Float32, CPU>[@model.layers.0.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=92), symbol:model.layers.0.mlp.gate_proj.weight])[symbol:model.layers.0.mlp.gate_proj.weight] - tensor.CPU.register () -> (%111:tensor<[6144, 2048], Float32, CPU>[@model.layers.0.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=95), symbol:model.layers.0.mlp.up_proj.weight])[symbol:model.layers.0.mlp.up_proj.weight] - tensor.CPU.register () -> (%184:tensor<[2048, 6144], Float32, CPU>[@model.layers.0.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=97), symbol:model.layers.0.mlp.down_proj.weight])[symbol:model.layers.0.mlp.down_proj.weight] - tensor.CPU.register () -> (%180:tensor<[2048], Float32, CPU>[@model.layers.1.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=100), symbol:model.layers.1.input_layernorm.weight])[symbol:model.layers.1.input_layernorm.weight] - tensor.CPU.register () -> (%285:tensor<[2048, 2048], Float32, 
CPU>[@model.layers.1.self_attn.q_proj.weight][symbol:model.layers.1.self_attn.q_proj.weight])[symbol:model.layers.1.self_attn.q_proj.weight] - tensor.CPU.register () -> (%32:tensor<[1024, 2048], Float32, CPU>[@model.layers.1.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=101), symbol:model.layers.1.self_attn.k_proj.weight])[symbol:model.layers.1.self_attn.k_proj.weight] - tensor.CPU.register () -> (%154:tensor<[1024, 2048], Float32, CPU>[@model.layers.1.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=103), symbol:model.layers.1.self_attn.v_proj.weight])[symbol:model.layers.1.self_attn.v_proj.weight] - tensor.CPU.register () -> (%131:tensor<[128], Float32, CPU>[@model.layers.1.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=107), symbol:model.layers.1.self_attn.q_norm.weight])[symbol:model.layers.1.self_attn.q_norm.weight] - tensor.CPU.register () -> (%68:tensor<[128], Float32, CPU>[@model.layers.1.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109), symbol:model.layers.1.self_attn.k_norm.weight])[symbol:model.layers.1.self_attn.k_norm.weight] - tensor.CPU.register () -> (%20:tensor<[2048, 2048], Float32, CPU>[@model.layers.1.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=122), symbol:model.layers.1.self_attn.o_proj.weight])[symbol:model.layers.1.self_attn.o_proj.weight] - tensor.CPU.register () -> (%73:tensor<[2048], Float32, CPU>[@model.layers.1.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=125), symbol:model.layers.1.post_attention_layernorm.weight])[symbol:model.layers.1.post_attention_layernorm.weight] - tensor.CPU.register () -> (%245:tensor<[6144, 2048], Float32, CPU>[@model.layers.1.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=126), symbol:model.layers.1.mlp.gate_proj.weight])[symbol:model.layers.1.mlp.gate_proj.weight] - tensor.CPU.register () -> (%230:tensor<[6144, 2048], Float32, CPU>[@model.layers.1.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=129), symbol:model.layers.1.mlp.up_proj.weight])[symbol:model.layers.1.mlp.up_proj.weight] - tensor.CPU.register () -> (%43:tensor<[2048, 6144], Float32, CPU>[@model.layers.1.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=131), symbol:model.layers.1.mlp.down_proj.weight])[symbol:model.layers.1.mlp.down_proj.weight] - tensor.CPU.register () -> (%86:tensor<[2048], Float32, CPU>[@model.layers.2.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=134), symbol:model.layers.2.input_layernorm.weight])[symbol:model.layers.2.input_layernorm.weight] - tensor.CPU.register () -> (%221:tensor<[2048, 2048], Float32, CPU>[@model.layers.2.self_attn.q_proj.weight][symbol:model.layers.2.self_attn.q_proj.weight])[symbol:model.layers.2.self_attn.q_proj.weight] - tensor.CPU.register () -> (%103:tensor<[1024, 2048], Float32, CPU>[@model.layers.2.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, 
ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=135), symbol:model.layers.2.self_attn.k_proj.weight])[symbol:model.layers.2.self_attn.k_proj.weight] - tensor.CPU.register () -> (%47:tensor<[1024, 2048], Float32, CPU>[@model.layers.2.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=137), symbol:model.layers.2.self_attn.v_proj.weight])[symbol:model.layers.2.self_attn.v_proj.weight] - tensor.CPU.register () -> (%65:tensor<[128], Float32, CPU>[@model.layers.2.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=141), symbol:model.layers.2.self_attn.q_norm.weight])[symbol:model.layers.2.self_attn.q_norm.weight] - tensor.CPU.register () -> (%16:tensor<[128], Float32, CPU>[@model.layers.2.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=143), symbol:model.layers.2.self_attn.k_norm.weight])[symbol:model.layers.2.self_attn.k_norm.weight] - tensor.CPU.register () -> (%85:tensor<[2048, 2048], Float32, CPU>[@model.layers.2.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=156), symbol:model.layers.2.self_attn.o_proj.weight])[symbol:model.layers.2.self_attn.o_proj.weight] - tensor.CPU.register () -> (%128:tensor<[2048], Float32, CPU>[@model.layers.2.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159), symbol:model.layers.2.post_attention_layernorm.weight])[symbol:model.layers.2.post_attention_layernorm.weight] - tensor.CPU.register () -> (%252:tensor<[6144, 2048], Float32, 
CPU>[@model.layers.2.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=160), symbol:model.layers.2.mlp.gate_proj.weight])[symbol:model.layers.2.mlp.gate_proj.weight] - tensor.CPU.register () -> (%24:tensor<[6144, 2048], Float32, CPU>[@model.layers.2.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=163), symbol:model.layers.2.mlp.up_proj.weight])[symbol:model.layers.2.mlp.up_proj.weight] - tensor.CPU.register () -> (%28:tensor<[2048, 6144], Float32, CPU>[@model.layers.2.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=165), symbol:model.layers.2.mlp.down_proj.weight])[symbol:model.layers.2.mlp.down_proj.weight] - tensor.CPU.register () -> (%1:tensor<[2048], Float32, CPU>[@model.layers.3.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=168), symbol:model.layers.3.input_layernorm.weight])[symbol:model.layers.3.input_layernorm.weight] - tensor.CPU.register () -> (%283:tensor<[2048, 2048], Float32, CPU>[@model.layers.3.self_attn.q_proj.weight][symbol:model.layers.3.self_attn.q_proj.weight])[symbol:model.layers.3.self_attn.q_proj.weight] - tensor.CPU.register () -> (%48:tensor<[1024, 2048], Float32, CPU>[@model.layers.3.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=169), symbol:model.layers.3.self_attn.k_proj.weight])[symbol:model.layers.3.self_attn.k_proj.weight] - tensor.CPU.register () -> (%244:tensor<[1024, 2048], Float32, 
CPU>[@model.layers.3.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=171), symbol:model.layers.3.self_attn.v_proj.weight])[symbol:model.layers.3.self_attn.v_proj.weight] - tensor.CPU.register () -> (%33:tensor<[128], Float32, CPU>[@model.layers.3.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=175), symbol:model.layers.3.self_attn.q_norm.weight])[symbol:model.layers.3.self_attn.q_norm.weight] - tensor.CPU.register () -> (%202:tensor<[128], Float32, CPU>[@model.layers.3.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=177), symbol:model.layers.3.self_attn.k_norm.weight])[symbol:model.layers.3.self_attn.k_norm.weight] - tensor.CPU.register () -> (%301:tensor<[2048, 2048], Float32, CPU>[@model.layers.3.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=190), symbol:model.layers.3.self_attn.o_proj.weight])[symbol:model.layers.3.self_attn.o_proj.weight] - tensor.CPU.register () -> (%223:tensor<[2048], Float32, CPU>[@model.layers.3.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=193), symbol:model.layers.3.post_attention_layernorm.weight])[symbol:model.layers.3.post_attention_layernorm.weight] - tensor.CPU.register () -> (%129:tensor<[6144, 2048], Float32, CPU>[@model.layers.3.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=194), symbol:model.layers.3.mlp.gate_proj.weight])[symbol:model.layers.3.mlp.gate_proj.weight] - 
tensor.CPU.register () -> (%188:tensor<[6144, 2048], Float32, CPU>[@model.layers.3.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=197), symbol:model.layers.3.mlp.up_proj.weight])[symbol:model.layers.3.mlp.up_proj.weight] - tensor.CPU.register () -> (%97:tensor<[2048, 6144], Float32, CPU>[@model.layers.3.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=199), symbol:model.layers.3.mlp.down_proj.weight])[symbol:model.layers.3.mlp.down_proj.weight] - tensor.CPU.register () -> (%3:tensor<[2048], Float32, CPU>[@model.layers.4.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=202), symbol:model.layers.4.input_layernorm.weight])[symbol:model.layers.4.input_layernorm.weight] - tensor.CPU.register () -> (%164:tensor<[2048, 2048], Float32, CPU>[@model.layers.4.self_attn.q_proj.weight][symbol:model.layers.4.self_attn.q_proj.weight])[symbol:model.layers.4.self_attn.q_proj.weight] - tensor.CPU.register () -> (%148:tensor<[1024, 2048], Float32, CPU>[@model.layers.4.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=203), symbol:model.layers.4.self_attn.k_proj.weight])[symbol:model.layers.4.self_attn.k_proj.weight] - tensor.CPU.register () -> (%279:tensor<[1024, 2048], Float32, CPU>[@model.layers.4.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=205), symbol:model.layers.4.self_attn.v_proj.weight])[symbol:model.layers.4.self_attn.v_proj.weight] - 
tensor.CPU.register () -> (%145:tensor<[128], Float32, CPU>[@model.layers.4.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=209), symbol:model.layers.4.self_attn.q_norm.weight])[symbol:model.layers.4.self_attn.q_norm.weight] - tensor.CPU.register () -> (%282:tensor<[128], Float32, CPU>[@model.layers.4.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=211), symbol:model.layers.4.self_attn.k_norm.weight])[symbol:model.layers.4.self_attn.k_norm.weight] - tensor.CPU.register () -> (%91:tensor<[2048, 2048], Float32, CPU>[@model.layers.4.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=224), symbol:model.layers.4.self_attn.o_proj.weight])[symbol:model.layers.4.self_attn.o_proj.weight] - tensor.CPU.register () -> (%258:tensor<[2048], Float32, CPU>[@model.layers.4.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=227), symbol:model.layers.4.post_attention_layernorm.weight])[symbol:model.layers.4.post_attention_layernorm.weight] - tensor.CPU.register () -> (%189:tensor<[6144, 2048], Float32, CPU>[@model.layers.4.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=228), symbol:model.layers.4.mlp.gate_proj.weight])[symbol:model.layers.4.mlp.gate_proj.weight] - tensor.CPU.register () -> (%156:tensor<[6144, 2048], Float32, CPU>[@model.layers.4.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=231), 
symbol:model.layers.4.mlp.up_proj.weight])[symbol:model.layers.4.mlp.up_proj.weight] - tensor.CPU.register () -> (%153:tensor<[2048, 6144], Float32, CPU>[@model.layers.4.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=233), symbol:model.layers.4.mlp.down_proj.weight])[symbol:model.layers.4.mlp.down_proj.weight] - tensor.CPU.register () -> (%256:tensor<[2048], Float32, CPU>[@model.layers.5.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236), symbol:model.layers.5.input_layernorm.weight])[symbol:model.layers.5.input_layernorm.weight] - tensor.CPU.register () -> (%78:tensor<[2048, 2048], Float32, CPU>[@model.layers.5.self_attn.q_proj.weight][symbol:model.layers.5.self_attn.q_proj.weight])[symbol:model.layers.5.self_attn.q_proj.weight] - tensor.CPU.register () -> (%72:tensor<[1024, 2048], Float32, CPU>[@model.layers.5.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=237), symbol:model.layers.5.self_attn.k_proj.weight])[symbol:model.layers.5.self_attn.k_proj.weight] - tensor.CPU.register () -> (%289:tensor<[1024, 2048], Float32, CPU>[@model.layers.5.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=239), symbol:model.layers.5.self_attn.v_proj.weight])[symbol:model.layers.5.self_attn.v_proj.weight] - tensor.CPU.register () -> (%225:tensor<[128], Float32, CPU>[@model.layers.5.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=243), symbol:model.layers.5.self_attn.q_norm.weight])[symbol:model.layers.5.self_attn.q_norm.weight] - 
tensor.CPU.register () -> (%7:tensor<[128], Float32, CPU>[@model.layers.5.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=245), symbol:model.layers.5.self_attn.k_norm.weight])[symbol:model.layers.5.self_attn.k_norm.weight] - tensor.CPU.register () -> (%264:tensor<[2048, 2048], Float32, CPU>[@model.layers.5.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=258), symbol:model.layers.5.self_attn.o_proj.weight])[symbol:model.layers.5.self_attn.o_proj.weight] - tensor.CPU.register () -> (%99:tensor<[2048], Float32, CPU>[@model.layers.5.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=261), symbol:model.layers.5.post_attention_layernorm.weight])[symbol:model.layers.5.post_attention_layernorm.weight] - tensor.CPU.register () -> (%4:tensor<[6144, 2048], Float32, CPU>[@model.layers.5.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=262), symbol:model.layers.5.mlp.gate_proj.weight])[symbol:model.layers.5.mlp.gate_proj.weight] - tensor.CPU.register () -> (%308:tensor<[6144, 2048], Float32, CPU>[@model.layers.5.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=265), symbol:model.layers.5.mlp.up_proj.weight])[symbol:model.layers.5.mlp.up_proj.weight] - tensor.CPU.register () -> (%74:tensor<[2048, 6144], Float32, CPU>[@model.layers.5.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, 
quant_to_type: UInt4, scale_1_type: Float32), uuid=267), symbol:model.layers.5.mlp.down_proj.weight])[symbol:model.layers.5.mlp.down_proj.weight] - tensor.CPU.register () -> (%132:tensor<[2048], Float32, CPU>[@model.layers.6.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=270), symbol:model.layers.6.input_layernorm.weight])[symbol:model.layers.6.input_layernorm.weight] - tensor.CPU.register () -> (%59:tensor<[2048, 2048], Float32, CPU>[@model.layers.6.self_attn.q_proj.weight][symbol:model.layers.6.self_attn.q_proj.weight])[symbol:model.layers.6.self_attn.q_proj.weight] - tensor.CPU.register () -> (%208:tensor<[1024, 2048], Float32, CPU>[@model.layers.6.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=271), symbol:model.layers.6.self_attn.k_proj.weight])[symbol:model.layers.6.self_attn.k_proj.weight] - tensor.CPU.register () -> (%238:tensor<[1024, 2048], Float32, CPU>[@model.layers.6.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=273), symbol:model.layers.6.self_attn.v_proj.weight])[symbol:model.layers.6.self_attn.v_proj.weight] - tensor.CPU.register () -> (%294:tensor<[128], Float32, CPU>[@model.layers.6.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=277), symbol:model.layers.6.self_attn.q_norm.weight])[symbol:model.layers.6.self_attn.q_norm.weight] - tensor.CPU.register () -> (%71:tensor<[128], Float32, CPU>[@model.layers.6.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279), 
symbol:model.layers.6.self_attn.k_norm.weight])[symbol:model.layers.6.self_attn.k_norm.weight] - tensor.CPU.register () -> (%52:tensor<[2048, 2048], Float32, CPU>[@model.layers.6.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=292), symbol:model.layers.6.self_attn.o_proj.weight])[symbol:model.layers.6.self_attn.o_proj.weight] - tensor.CPU.register () -> (%108:tensor<[2048], Float32, CPU>[@model.layers.6.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=295), symbol:model.layers.6.post_attention_layernorm.weight])[symbol:model.layers.6.post_attention_layernorm.weight] - tensor.CPU.register () -> (%80:tensor<[6144, 2048], Float32, CPU>[@model.layers.6.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=296), symbol:model.layers.6.mlp.gate_proj.weight])[symbol:model.layers.6.mlp.gate_proj.weight] - tensor.CPU.register () -> (%276:tensor<[6144, 2048], Float32, CPU>[@model.layers.6.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=299), symbol:model.layers.6.mlp.up_proj.weight])[symbol:model.layers.6.mlp.up_proj.weight] - tensor.CPU.register () -> (%227:tensor<[2048, 6144], Float32, CPU>[@model.layers.6.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=301), symbol:model.layers.6.mlp.down_proj.weight])[symbol:model.layers.6.mlp.down_proj.weight] - tensor.CPU.register () -> (%107:tensor<[2048], Float32, 
CPU>[@model.layers.7.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304), symbol:model.layers.7.input_layernorm.weight])[symbol:model.layers.7.input_layernorm.weight] - tensor.CPU.register () -> (%287:tensor<[2048, 2048], Float32, CPU>[@model.layers.7.self_attn.q_proj.weight][symbol:model.layers.7.self_attn.q_proj.weight])[symbol:model.layers.7.self_attn.q_proj.weight] - tensor.CPU.register () -> (%135:tensor<[1024, 2048], Float32, CPU>[@model.layers.7.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=305), symbol:model.layers.7.self_attn.k_proj.weight])[symbol:model.layers.7.self_attn.k_proj.weight] - tensor.CPU.register () -> (%300:tensor<[1024, 2048], Float32, CPU>[@model.layers.7.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=307), symbol:model.layers.7.self_attn.v_proj.weight])[symbol:model.layers.7.self_attn.v_proj.weight] - tensor.CPU.register () -> (%23:tensor<[128], Float32, CPU>[@model.layers.7.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=311), symbol:model.layers.7.self_attn.q_norm.weight])[symbol:model.layers.7.self_attn.q_norm.weight] - tensor.CPU.register () -> (%137:tensor<[128], Float32, CPU>[@model.layers.7.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=313), symbol:model.layers.7.self_attn.k_norm.weight])[symbol:model.layers.7.self_attn.k_norm.weight] - tensor.CPU.register () -> (%251:tensor<[2048, 2048], Float32, CPU>[@model.layers.7.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=326), symbol:model.layers.7.self_attn.o_proj.weight])[symbol:model.layers.7.self_attn.o_proj.weight] - tensor.CPU.register () -> (%53:tensor<[2048], Float32, CPU>[@model.layers.7.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=329), symbol:model.layers.7.post_attention_layernorm.weight])[symbol:model.layers.7.post_attention_layernorm.weight] - tensor.CPU.register () -> (%155:tensor<[6144, 2048], Float32, CPU>[@model.layers.7.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=330), symbol:model.layers.7.mlp.gate_proj.weight])[symbol:model.layers.7.mlp.gate_proj.weight] - tensor.CPU.register () -> (%218:tensor<[6144, 2048], Float32, CPU>[@model.layers.7.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=333), symbol:model.layers.7.mlp.up_proj.weight])[symbol:model.layers.7.mlp.up_proj.weight] - tensor.CPU.register () -> (%275:tensor<[2048, 6144], Float32, CPU>[@model.layers.7.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=335), symbol:model.layers.7.mlp.down_proj.weight])[symbol:model.layers.7.mlp.down_proj.weight] - tensor.CPU.register () -> (%171:tensor<[2048], Float32, CPU>[@model.layers.8.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=338), symbol:model.layers.8.input_layernorm.weight])[symbol:model.layers.8.input_layernorm.weight] - tensor.CPU.register () -> (%165:tensor<[2048, 2048], 
Float32, CPU>[@model.layers.8.self_attn.q_proj.weight][symbol:model.layers.8.self_attn.q_proj.weight])[symbol:model.layers.8.self_attn.q_proj.weight] - tensor.CPU.register () -> (%194:tensor<[1024, 2048], Float32, CPU>[@model.layers.8.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=339), symbol:model.layers.8.self_attn.k_proj.weight])[symbol:model.layers.8.self_attn.k_proj.weight] - tensor.CPU.register () -> (%181:tensor<[1024, 2048], Float32, CPU>[@model.layers.8.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=341), symbol:model.layers.8.self_attn.v_proj.weight])[symbol:model.layers.8.self_attn.v_proj.weight] - tensor.CPU.register () -> (%309:tensor<[128], Float32, CPU>[@model.layers.8.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=345), symbol:model.layers.8.self_attn.q_norm.weight])[symbol:model.layers.8.self_attn.q_norm.weight] - tensor.CPU.register () -> (%92:tensor<[128], Float32, CPU>[@model.layers.8.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=347), symbol:model.layers.8.self_attn.k_norm.weight])[symbol:model.layers.8.self_attn.k_norm.weight] - tensor.CPU.register () -> (%197:tensor<[2048, 2048], Float32, CPU>[@model.layers.8.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=360), symbol:model.layers.8.self_attn.o_proj.weight])[symbol:model.layers.8.self_attn.o_proj.weight] - tensor.CPU.register () -> (%122:tensor<[2048], Float32, 
CPU>[@model.layers.8.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=363), symbol:model.layers.8.post_attention_layernorm.weight])[symbol:model.layers.8.post_attention_layernorm.weight] - tensor.CPU.register () -> (%110:tensor<[6144, 2048], Float32, CPU>[@model.layers.8.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=364), symbol:model.layers.8.mlp.gate_proj.weight])[symbol:model.layers.8.mlp.gate_proj.weight] - tensor.CPU.register () -> (%236:tensor<[6144, 2048], Float32, CPU>[@model.layers.8.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=367), symbol:model.layers.8.mlp.up_proj.weight])[symbol:model.layers.8.mlp.up_proj.weight] - tensor.CPU.register () -> (%106:tensor<[2048, 6144], Float32, CPU>[@model.layers.8.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=369), symbol:model.layers.8.mlp.down_proj.weight])[symbol:model.layers.8.mlp.down_proj.weight] - tensor.CPU.register () -> (%178:tensor<[2048], Float32, CPU>[@model.layers.9.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372), symbol:model.layers.9.input_layernorm.weight])[symbol:model.layers.9.input_layernorm.weight] - tensor.CPU.register () -> (%235:tensor<[2048, 2048], Float32, CPU>[@model.layers.9.self_attn.q_proj.weight][symbol:model.layers.9.self_attn.q_proj.weight])[symbol:model.layers.9.self_attn.q_proj.weight] - tensor.CPU.register () -> (%69:tensor<[1024, 2048], Float32, 
CPU>[@model.layers.9.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=373), symbol:model.layers.9.self_attn.k_proj.weight])[symbol:model.layers.9.self_attn.k_proj.weight] - tensor.CPU.register () -> (%120:tensor<[1024, 2048], Float32, CPU>[@model.layers.9.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=375), symbol:model.layers.9.self_attn.v_proj.weight])[symbol:model.layers.9.self_attn.v_proj.weight] - tensor.CPU.register () -> (%140:tensor<[128], Float32, CPU>[@model.layers.9.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=379), symbol:model.layers.9.self_attn.q_norm.weight])[symbol:model.layers.9.self_attn.q_norm.weight] - tensor.CPU.register () -> (%29:tensor<[128], Float32, CPU>[@model.layers.9.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=381), symbol:model.layers.9.self_attn.k_norm.weight])[symbol:model.layers.9.self_attn.k_norm.weight] - tensor.CPU.register () -> (%205:tensor<[2048, 2048], Float32, CPU>[@model.layers.9.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=394), symbol:model.layers.9.self_attn.o_proj.weight])[symbol:model.layers.9.self_attn.o_proj.weight] - tensor.CPU.register () -> (%304:tensor<[2048], Float32, CPU>[@model.layers.9.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397), symbol:model.layers.9.post_attention_layernorm.weight])[symbol:model.layers.9.post_attention_layernorm.weight] - 
tensor.CPU.register () -> (%263:tensor<[6144, 2048], Float32, CPU>[@model.layers.9.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=398), symbol:model.layers.9.mlp.gate_proj.weight])[symbol:model.layers.9.mlp.gate_proj.weight] - tensor.CPU.register () -> (%102:tensor<[6144, 2048], Float32, CPU>[@model.layers.9.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=401), symbol:model.layers.9.mlp.up_proj.weight])[symbol:model.layers.9.mlp.up_proj.weight] - tensor.CPU.register () -> (%136:tensor<[2048, 6144], Float32, CPU>[@model.layers.9.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=403), symbol:model.layers.9.mlp.down_proj.weight])[symbol:model.layers.9.mlp.down_proj.weight] - tensor.CPU.register () -> (%186:tensor<[2048], Float32, CPU>[@model.layers.10.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=406), symbol:model.layers.10.input_layernorm.weight])[symbol:model.layers.10.input_layernorm.weight] - tensor.CPU.register () -> (%278:tensor<[2048, 2048], Float32, CPU>[@model.layers.10.self_attn.q_proj.weight][symbol:model.layers.10.self_attn.q_proj.weight])[symbol:model.layers.10.self_attn.q_proj.weight] - tensor.CPU.register () -> (%182:tensor<[1024, 2048], Float32, CPU>[@model.layers.10.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=407), 
symbol:model.layers.10.self_attn.k_proj.weight])[symbol:model.layers.10.self_attn.k_proj.weight] - tensor.CPU.register () -> (%138:tensor<[1024, 2048], Float32, CPU>[@model.layers.10.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=409), symbol:model.layers.10.self_attn.v_proj.weight])[symbol:model.layers.10.self_attn.v_proj.weight] - tensor.CPU.register () -> (%305:tensor<[128], Float32, CPU>[@model.layers.10.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=413), symbol:model.layers.10.self_attn.q_norm.weight])[symbol:model.layers.10.self_attn.q_norm.weight] - tensor.CPU.register () -> (%272:tensor<[128], Float32, CPU>[@model.layers.10.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=415), symbol:model.layers.10.self_attn.k_norm.weight])[symbol:model.layers.10.self_attn.k_norm.weight] - tensor.CPU.register () -> (%233:tensor<[2048, 2048], Float32, CPU>[@model.layers.10.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=428), symbol:model.layers.10.self_attn.o_proj.weight])[symbol:model.layers.10.self_attn.o_proj.weight] - tensor.CPU.register () -> (%266:tensor<[2048], Float32, CPU>[@model.layers.10.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=431), symbol:model.layers.10.post_attention_layernorm.weight])[symbol:model.layers.10.post_attention_layernorm.weight] - tensor.CPU.register () -> (%124:tensor<[6144, 2048], Float32, CPU>[@model.layers.10.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=432), symbol:model.layers.10.mlp.gate_proj.weight])[symbol:model.layers.10.mlp.gate_proj.weight] - tensor.CPU.register () -> (%261:tensor<[6144, 2048], Float32, CPU>[@model.layers.10.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=435), symbol:model.layers.10.mlp.up_proj.weight])[symbol:model.layers.10.mlp.up_proj.weight] - tensor.CPU.register () -> (%45:tensor<[2048, 6144], Float32, CPU>[@model.layers.10.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=437), symbol:model.layers.10.mlp.down_proj.weight])[symbol:model.layers.10.mlp.down_proj.weight] - tensor.CPU.register () -> (%219:tensor<[2048], Float32, CPU>[@model.layers.11.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=440), symbol:model.layers.11.input_layernorm.weight])[symbol:model.layers.11.input_layernorm.weight] - tensor.CPU.register () -> (%274:tensor<[2048, 2048], Float32, CPU>[@model.layers.11.self_attn.q_proj.weight][symbol:model.layers.11.self_attn.q_proj.weight])[symbol:model.layers.11.self_attn.q_proj.weight] - tensor.CPU.register () -> (%157:tensor<[1024, 2048], Float32, CPU>[@model.layers.11.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=441), symbol:model.layers.11.self_attn.k_proj.weight])[symbol:model.layers.11.self_attn.k_proj.weight] - tensor.CPU.register () -> (%63:tensor<[1024, 2048], Float32, CPU>[@model.layers.11.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, 
ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=443), symbol:model.layers.11.self_attn.v_proj.weight])[symbol:model.layers.11.self_attn.v_proj.weight] - tensor.CPU.register () -> (%214:tensor<[128], Float32, CPU>[@model.layers.11.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=447), symbol:model.layers.11.self_attn.q_norm.weight])[symbol:model.layers.11.self_attn.q_norm.weight] - tensor.CPU.register () -> (%201:tensor<[128], Float32, CPU>[@model.layers.11.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=449), symbol:model.layers.11.self_attn.k_norm.weight])[symbol:model.layers.11.self_attn.k_norm.weight] - tensor.CPU.register () -> (%118:tensor<[2048, 2048], Float32, CPU>[@model.layers.11.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=462), symbol:model.layers.11.self_attn.o_proj.weight])[symbol:model.layers.11.self_attn.o_proj.weight] - tensor.CPU.register () -> (%151:tensor<[2048], Float32, CPU>[@model.layers.11.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=465), symbol:model.layers.11.post_attention_layernorm.weight])[symbol:model.layers.11.post_attention_layernorm.weight] - tensor.CPU.register () -> (%207:tensor<[6144, 2048], Float32, CPU>[@model.layers.11.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=466), symbol:model.layers.11.mlp.gate_proj.weight])[symbol:model.layers.11.mlp.gate_proj.weight] - tensor.CPU.register () -> (%226:tensor<[6144, 2048], Float32, 
CPU>[@model.layers.11.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=469), symbol:model.layers.11.mlp.up_proj.weight])[symbol:model.layers.11.mlp.up_proj.weight] - tensor.CPU.register () -> (%224:tensor<[2048, 6144], Float32, CPU>[@model.layers.11.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=471), symbol:model.layers.11.mlp.down_proj.weight])[symbol:model.layers.11.mlp.down_proj.weight] - tensor.CPU.register () -> (%55:tensor<[2048], Float32, CPU>[@model.layers.12.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474), symbol:model.layers.12.input_layernorm.weight])[symbol:model.layers.12.input_layernorm.weight] - tensor.CPU.register () -> (%217:tensor<[2048, 2048], Float32, CPU>[@model.layers.12.self_attn.q_proj.weight][symbol:model.layers.12.self_attn.q_proj.weight])[symbol:model.layers.12.self_attn.q_proj.weight] - tensor.CPU.register () -> (%297:tensor<[1024, 2048], Float32, CPU>[@model.layers.12.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=475), symbol:model.layers.12.self_attn.k_proj.weight])[symbol:model.layers.12.self_attn.k_proj.weight] - tensor.CPU.register () -> (%94:tensor<[1024, 2048], Float32, CPU>[@model.layers.12.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=477), symbol:model.layers.12.self_attn.v_proj.weight])[symbol:model.layers.12.self_attn.v_proj.weight] - tensor.CPU.register () -> 
(%161:tensor<[128], Float32, CPU>[@model.layers.12.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=481), symbol:model.layers.12.self_attn.q_norm.weight])[symbol:model.layers.12.self_attn.q_norm.weight] - tensor.CPU.register () -> (%277:tensor<[128], Float32, CPU>[@model.layers.12.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=483), symbol:model.layers.12.self_attn.k_norm.weight])[symbol:model.layers.12.self_attn.k_norm.weight] - tensor.CPU.register () -> (%49:tensor<[2048, 2048], Float32, CPU>[@model.layers.12.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=496), symbol:model.layers.12.self_attn.o_proj.weight])[symbol:model.layers.12.self_attn.o_proj.weight] - tensor.CPU.register () -> (%14:tensor<[2048], Float32, CPU>[@model.layers.12.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499), symbol:model.layers.12.post_attention_layernorm.weight])[symbol:model.layers.12.post_attention_layernorm.weight] - tensor.CPU.register () -> (%262:tensor<[6144, 2048], Float32, CPU>[@model.layers.12.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=500), symbol:model.layers.12.mlp.gate_proj.weight])[symbol:model.layers.12.mlp.gate_proj.weight] - tensor.CPU.register () -> (%255:tensor<[6144, 2048], Float32, CPU>[@model.layers.12.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=503), 
symbol:model.layers.12.mlp.up_proj.weight])[symbol:model.layers.12.mlp.up_proj.weight] - tensor.CPU.register () -> (%22:tensor<[2048, 6144], Float32, CPU>[@model.layers.12.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=505), symbol:model.layers.12.mlp.down_proj.weight])[symbol:model.layers.12.mlp.down_proj.weight] - tensor.CPU.register () -> (%212:tensor<[2048], Float32, CPU>[@model.layers.13.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=508), symbol:model.layers.13.input_layernorm.weight])[symbol:model.layers.13.input_layernorm.weight] - tensor.CPU.register () -> (%114:tensor<[2048, 2048], Float32, CPU>[@model.layers.13.self_attn.q_proj.weight][symbol:model.layers.13.self_attn.q_proj.weight])[symbol:model.layers.13.self_attn.q_proj.weight] - tensor.CPU.register () -> (%152:tensor<[1024, 2048], Float32, CPU>[@model.layers.13.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=509), symbol:model.layers.13.self_attn.k_proj.weight])[symbol:model.layers.13.self_attn.k_proj.weight] - tensor.CPU.register () -> (%15:tensor<[1024, 2048], Float32, CPU>[@model.layers.13.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=511), symbol:model.layers.13.self_attn.v_proj.weight])[symbol:model.layers.13.self_attn.v_proj.weight] - tensor.CPU.register () -> (%307:tensor<[128], Float32, CPU>[@model.layers.13.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=515), 
symbol:model.layers.13.self_attn.q_norm.weight])[symbol:model.layers.13.self_attn.q_norm.weight] - tensor.CPU.register () -> (%30:tensor<[128], Float32, CPU>[@model.layers.13.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517), symbol:model.layers.13.self_attn.k_norm.weight])[symbol:model.layers.13.self_attn.k_norm.weight] - tensor.CPU.register () -> (%250:tensor<[2048, 2048], Float32, CPU>[@model.layers.13.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=530), symbol:model.layers.13.self_attn.o_proj.weight])[symbol:model.layers.13.self_attn.o_proj.weight] - tensor.CPU.register () -> (%160:tensor<[2048], Float32, CPU>[@model.layers.13.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=533), symbol:model.layers.13.post_attention_layernorm.weight])[symbol:model.layers.13.post_attention_layernorm.weight] - tensor.CPU.register () -> (%247:tensor<[6144, 2048], Float32, CPU>[@model.layers.13.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=534), symbol:model.layers.13.mlp.gate_proj.weight])[symbol:model.layers.13.mlp.gate_proj.weight] - tensor.CPU.register () -> (%98:tensor<[6144, 2048], Float32, CPU>[@model.layers.13.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=537), symbol:model.layers.13.mlp.up_proj.weight])[symbol:model.layers.13.mlp.up_proj.weight] - tensor.CPU.register () -> (%193:tensor<[2048, 6144], Float32, 
CPU>[@model.layers.13.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=539), symbol:model.layers.13.mlp.down_proj.weight])[symbol:model.layers.13.mlp.down_proj.weight] - tensor.CPU.register () -> (%246:tensor<[2048], Float32, CPU>[@model.layers.14.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=542), symbol:model.layers.14.input_layernorm.weight])[symbol:model.layers.14.input_layernorm.weight] - tensor.CPU.register () -> (%209:tensor<[2048, 2048], Float32, CPU>[@model.layers.14.self_attn.q_proj.weight][symbol:model.layers.14.self_attn.q_proj.weight])[symbol:model.layers.14.self_attn.q_proj.weight] - tensor.CPU.register () -> (%38:tensor<[1024, 2048], Float32, CPU>[@model.layers.14.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=543), symbol:model.layers.14.self_attn.k_proj.weight])[symbol:model.layers.14.self_attn.k_proj.weight] - tensor.CPU.register () -> (%232:tensor<[1024, 2048], Float32, CPU>[@model.layers.14.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=545), symbol:model.layers.14.self_attn.v_proj.weight])[symbol:model.layers.14.self_attn.v_proj.weight] - tensor.CPU.register () -> (%0:tensor<[128], Float32, CPU>[@model.layers.14.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=549), symbol:model.layers.14.self_attn.q_norm.weight])[symbol:model.layers.14.self_attn.q_norm.weight] - tensor.CPU.register () -> (%57:tensor<[128], Float32, CPU>[@model.layers.14.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=551), symbol:model.layers.14.self_attn.k_norm.weight])[symbol:model.layers.14.self_attn.k_norm.weight] - tensor.CPU.register () -> (%168:tensor<[2048, 2048], Float32, CPU>[@model.layers.14.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=564), symbol:model.layers.14.self_attn.o_proj.weight])[symbol:model.layers.14.self_attn.o_proj.weight] - tensor.CPU.register () -> (%75:tensor<[2048], Float32, CPU>[@model.layers.14.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=567), symbol:model.layers.14.post_attention_layernorm.weight])[symbol:model.layers.14.post_attention_layernorm.weight] - tensor.CPU.register () -> (%37:tensor<[6144, 2048], Float32, CPU>[@model.layers.14.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=568), symbol:model.layers.14.mlp.gate_proj.weight])[symbol:model.layers.14.mlp.gate_proj.weight] - tensor.CPU.register () -> (%147:tensor<[6144, 2048], Float32, CPU>[@model.layers.14.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=571), symbol:model.layers.14.mlp.up_proj.weight])[symbol:model.layers.14.mlp.up_proj.weight] - tensor.CPU.register () -> (%163:tensor<[2048, 6144], Float32, CPU>[@model.layers.14.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=573), 
symbol:model.layers.14.mlp.down_proj.weight])[symbol:model.layers.14.mlp.down_proj.weight] - tensor.CPU.register () -> (%67:tensor<[2048], Float32, CPU>[@model.layers.15.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=576), symbol:model.layers.15.input_layernorm.weight])[symbol:model.layers.15.input_layernorm.weight] - tensor.CPU.register () -> (%46:tensor<[2048, 2048], Float32, CPU>[@model.layers.15.self_attn.q_proj.weight][symbol:model.layers.15.self_attn.q_proj.weight])[symbol:model.layers.15.self_attn.q_proj.weight] - tensor.CPU.register () -> (%268:tensor<[1024, 2048], Float32, CPU>[@model.layers.15.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=577), symbol:model.layers.15.self_attn.k_proj.weight])[symbol:model.layers.15.self_attn.k_proj.weight] - tensor.CPU.register () -> (%117:tensor<[1024, 2048], Float32, CPU>[@model.layers.15.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=579), symbol:model.layers.15.self_attn.v_proj.weight])[symbol:model.layers.15.self_attn.v_proj.weight] - tensor.CPU.register () -> (%213:tensor<[128], Float32, CPU>[@model.layers.15.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=583), symbol:model.layers.15.self_attn.q_norm.weight])[symbol:model.layers.15.self_attn.q_norm.weight] - tensor.CPU.register () -> (%100:tensor<[128], Float32, CPU>[@model.layers.15.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=585), symbol:model.layers.15.self_attn.k_norm.weight])[symbol:model.layers.15.self_attn.k_norm.weight] - tensor.CPU.register () -> 
(%303:tensor<[2048, 2048], Float32, CPU>[@model.layers.15.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=598), symbol:model.layers.15.self_attn.o_proj.weight])[symbol:model.layers.15.self_attn.o_proj.weight] - tensor.CPU.register () -> (%167:tensor<[2048], Float32, CPU>[@model.layers.15.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=601), symbol:model.layers.15.post_attention_layernorm.weight])[symbol:model.layers.15.post_attention_layernorm.weight] - tensor.CPU.register () -> (%260:tensor<[6144, 2048], Float32, CPU>[@model.layers.15.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=602), symbol:model.layers.15.mlp.gate_proj.weight])[symbol:model.layers.15.mlp.gate_proj.weight] - tensor.CPU.register () -> (%42:tensor<[6144, 2048], Float32, CPU>[@model.layers.15.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=605), symbol:model.layers.15.mlp.up_proj.weight])[symbol:model.layers.15.mlp.up_proj.weight] - tensor.CPU.register () -> (%290:tensor<[2048, 6144], Float32, CPU>[@model.layers.15.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=607), symbol:model.layers.15.mlp.down_proj.weight])[symbol:model.layers.15.mlp.down_proj.weight] - tensor.CPU.register () -> (%93:tensor<[2048], Float32, CPU>[@model.layers.16.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=610), symbol:model.layers.16.input_layernorm.weight])[symbol:model.layers.16.input_layernorm.weight] - tensor.CPU.register () -> (%17:tensor<[2048, 2048], Float32, CPU>[@model.layers.16.self_attn.q_proj.weight][symbol:model.layers.16.self_attn.q_proj.weight])[symbol:model.layers.16.self_attn.q_proj.weight] - tensor.CPU.register () -> (%228:tensor<[1024, 2048], Float32, CPU>[@model.layers.16.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=611), symbol:model.layers.16.self_attn.k_proj.weight])[symbol:model.layers.16.self_attn.k_proj.weight] - tensor.CPU.register () -> (%66:tensor<[1024, 2048], Float32, CPU>[@model.layers.16.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=613), symbol:model.layers.16.self_attn.v_proj.weight])[symbol:model.layers.16.self_attn.v_proj.weight] - tensor.CPU.register () -> (%240:tensor<[128], Float32, CPU>[@model.layers.16.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=617), symbol:model.layers.16.self_attn.q_norm.weight])[symbol:model.layers.16.self_attn.q_norm.weight] - tensor.CPU.register () -> (%306:tensor<[128], Float32, CPU>[@model.layers.16.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619), symbol:model.layers.16.self_attn.k_norm.weight])[symbol:model.layers.16.self_attn.k_norm.weight] - tensor.CPU.register () -> (%211:tensor<[2048, 2048], Float32, CPU>[@model.layers.16.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=632), 
symbol:model.layers.16.self_attn.o_proj.weight])[symbol:model.layers.16.self_attn.o_proj.weight] - tensor.CPU.register () -> (%210:tensor<[2048], Float32, CPU>[@model.layers.16.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=635), symbol:model.layers.16.post_attention_layernorm.weight])[symbol:model.layers.16.post_attention_layernorm.weight] - tensor.CPU.register () -> (%130:tensor<[6144, 2048], Float32, CPU>[@model.layers.16.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=636), symbol:model.layers.16.mlp.gate_proj.weight])[symbol:model.layers.16.mlp.gate_proj.weight] - tensor.CPU.register () -> (%79:tensor<[6144, 2048], Float32, CPU>[@model.layers.16.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=639), symbol:model.layers.16.mlp.up_proj.weight])[symbol:model.layers.16.mlp.up_proj.weight] - tensor.CPU.register () -> (%248:tensor<[2048, 6144], Float32, CPU>[@model.layers.16.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=641), symbol:model.layers.16.mlp.down_proj.weight])[symbol:model.layers.16.mlp.down_proj.weight] - tensor.CPU.register () -> (%231:tensor<[2048], Float32, CPU>[@model.layers.17.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=644), symbol:model.layers.17.input_layernorm.weight])[symbol:model.layers.17.input_layernorm.weight] - tensor.CPU.register () -> (%64:tensor<[2048, 2048], Float32, 
CPU>[@model.layers.17.self_attn.q_proj.weight][symbol:model.layers.17.self_attn.q_proj.weight])[symbol:model.layers.17.self_attn.q_proj.weight] - tensor.CPU.register () -> (%237:tensor<[1024, 2048], Float32, CPU>[@model.layers.17.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=645), symbol:model.layers.17.self_attn.k_proj.weight])[symbol:model.layers.17.self_attn.k_proj.weight] - tensor.CPU.register () -> (%6:tensor<[1024, 2048], Float32, CPU>[@model.layers.17.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=647), symbol:model.layers.17.self_attn.v_proj.weight])[symbol:model.layers.17.self_attn.v_proj.weight] - tensor.CPU.register () -> (%222:tensor<[128], Float32, CPU>[@model.layers.17.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=651), symbol:model.layers.17.self_attn.q_norm.weight])[symbol:model.layers.17.self_attn.q_norm.weight] - tensor.CPU.register () -> (%191:tensor<[128], Float32, CPU>[@model.layers.17.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=653), symbol:model.layers.17.self_attn.k_norm.weight])[symbol:model.layers.17.self_attn.k_norm.weight] - tensor.CPU.register () -> (%125:tensor<[2048, 2048], Float32, CPU>[@model.layers.17.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=666), symbol:model.layers.17.self_attn.o_proj.weight])[symbol:model.layers.17.self_attn.o_proj.weight] - tensor.CPU.register () -> (%242:tensor<[2048], Float32, 
CPU>[@model.layers.17.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669), symbol:model.layers.17.post_attention_layernorm.weight])[symbol:model.layers.17.post_attention_layernorm.weight] - tensor.CPU.register () -> (%177:tensor<[6144, 2048], Float32, CPU>[@model.layers.17.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=670), symbol:model.layers.17.mlp.gate_proj.weight])[symbol:model.layers.17.mlp.gate_proj.weight] - tensor.CPU.register () -> (%26:tensor<[6144, 2048], Float32, CPU>[@model.layers.17.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=673), symbol:model.layers.17.mlp.up_proj.weight])[symbol:model.layers.17.mlp.up_proj.weight] - tensor.CPU.register () -> (%25:tensor<[2048, 6144], Float32, CPU>[@model.layers.17.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=675), symbol:model.layers.17.mlp.down_proj.weight])[symbol:model.layers.17.mlp.down_proj.weight] - tensor.CPU.register () -> (%296:tensor<[2048], Float32, CPU>[@model.layers.18.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=678), symbol:model.layers.18.input_layernorm.weight])[symbol:model.layers.18.input_layernorm.weight] - tensor.CPU.register () -> (%273:tensor<[2048, 2048], Float32, CPU>[@model.layers.18.self_attn.q_proj.weight][symbol:model.layers.18.self_attn.q_proj.weight])[symbol:model.layers.18.self_attn.q_proj.weight] - tensor.CPU.register () -> (%284:tensor<[1024, 2048], Float32, 
CPU>[@model.layers.18.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=679), symbol:model.layers.18.self_attn.k_proj.weight])[symbol:model.layers.18.self_attn.k_proj.weight] - tensor.CPU.register () -> (%18:tensor<[1024, 2048], Float32, CPU>[@model.layers.18.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=681), symbol:model.layers.18.self_attn.v_proj.weight])[symbol:model.layers.18.self_attn.v_proj.weight] - tensor.CPU.register () -> (%51:tensor<[128], Float32, CPU>[@model.layers.18.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=685), symbol:model.layers.18.self_attn.q_norm.weight])[symbol:model.layers.18.self_attn.q_norm.weight] - tensor.CPU.register () -> (%21:tensor<[128], Float32, CPU>[@model.layers.18.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=687), symbol:model.layers.18.self_attn.k_norm.weight])[symbol:model.layers.18.self_attn.k_norm.weight] - tensor.CPU.register () -> (%2:tensor<[2048, 2048], Float32, CPU>[@model.layers.18.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=700), symbol:model.layers.18.self_attn.o_proj.weight])[symbol:model.layers.18.self_attn.o_proj.weight] - tensor.CPU.register () -> (%10:tensor<[2048], Float32, CPU>[@model.layers.18.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=703), 
symbol:model.layers.18.post_attention_layernorm.weight])[symbol:model.layers.18.post_attention_layernorm.weight] - tensor.CPU.register () -> (%166:tensor<[6144, 2048], Float32, CPU>[@model.layers.18.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=704), symbol:model.layers.18.mlp.gate_proj.weight])[symbol:model.layers.18.mlp.gate_proj.weight] - tensor.CPU.register () -> (%271:tensor<[6144, 2048], Float32, CPU>[@model.layers.18.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=707), symbol:model.layers.18.mlp.up_proj.weight])[symbol:model.layers.18.mlp.up_proj.weight] - tensor.CPU.register () -> (%112:tensor<[2048, 6144], Float32, CPU>[@model.layers.18.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=709), symbol:model.layers.18.mlp.down_proj.weight])[symbol:model.layers.18.mlp.down_proj.weight] - tensor.CPU.register () -> (%113:tensor<[2048], Float32, CPU>[@model.layers.19.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=712), symbol:model.layers.19.input_layernorm.weight])[symbol:model.layers.19.input_layernorm.weight] - tensor.CPU.register () -> (%8:tensor<[2048, 2048], Float32, CPU>[@model.layers.19.self_attn.q_proj.weight][symbol:model.layers.19.self_attn.q_proj.weight])[symbol:model.layers.19.self_attn.q_proj.weight] - tensor.CPU.register () -> (%286:tensor<[1024, 2048], Float32, CPU>[@model.layers.19.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, 
scale_1_type: Float32), uuid=713), symbol:model.layers.19.self_attn.k_proj.weight])[symbol:model.layers.19.self_attn.k_proj.weight] - tensor.CPU.register () -> (%50:tensor<[1024, 2048], Float32, CPU>[@model.layers.19.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=715), symbol:model.layers.19.self_attn.v_proj.weight])[symbol:model.layers.19.self_attn.v_proj.weight] - tensor.CPU.register () -> (%116:tensor<[128], Float32, CPU>[@model.layers.19.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=719), symbol:model.layers.19.self_attn.q_norm.weight])[symbol:model.layers.19.self_attn.q_norm.weight] - tensor.CPU.register () -> (%84:tensor<[128], Float32, CPU>[@model.layers.19.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=721), symbol:model.layers.19.self_attn.k_norm.weight])[symbol:model.layers.19.self_attn.k_norm.weight] - tensor.CPU.register () -> (%58:tensor<[2048, 2048], Float32, CPU>[@model.layers.19.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=734), symbol:model.layers.19.self_attn.o_proj.weight])[symbol:model.layers.19.self_attn.o_proj.weight] - tensor.CPU.register () -> (%95:tensor<[2048], Float32, CPU>[@model.layers.19.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=737), symbol:model.layers.19.post_attention_layernorm.weight])[symbol:model.layers.19.post_attention_layernorm.weight] - tensor.CPU.register () -> (%281:tensor<[6144, 2048], Float32, CPU>[@model.layers.19.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 
32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=738), symbol:model.layers.19.mlp.gate_proj.weight])[symbol:model.layers.19.mlp.gate_proj.weight] - tensor.CPU.register () -> (%82:tensor<[6144, 2048], Float32, CPU>[@model.layers.19.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=741), symbol:model.layers.19.mlp.up_proj.weight])[symbol:model.layers.19.mlp.up_proj.weight] - tensor.CPU.register () -> (%173:tensor<[2048, 6144], Float32, CPU>[@model.layers.19.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=743), symbol:model.layers.19.mlp.down_proj.weight])[symbol:model.layers.19.mlp.down_proj.weight] - tensor.CPU.register () -> (%203:tensor<[2048], Float32, CPU>[@model.layers.20.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746), symbol:model.layers.20.input_layernorm.weight])[symbol:model.layers.20.input_layernorm.weight] - tensor.CPU.register () -> (%280:tensor<[2048, 2048], Float32, CPU>[@model.layers.20.self_attn.q_proj.weight][symbol:model.layers.20.self_attn.q_proj.weight])[symbol:model.layers.20.self_attn.q_proj.weight] - tensor.CPU.register () -> (%253:tensor<[1024, 2048], Float32, CPU>[@model.layers.20.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=747), symbol:model.layers.20.self_attn.k_proj.weight])[symbol:model.layers.20.self_attn.k_proj.weight] - tensor.CPU.register () -> (%239:tensor<[1024, 2048], Float32, CPU>[@model.layers.20.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 
7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=749), symbol:model.layers.20.self_attn.v_proj.weight])[symbol:model.layers.20.self_attn.v_proj.weight] - tensor.CPU.register () -> (%143:tensor<[128], Float32, CPU>[@model.layers.20.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=753), symbol:model.layers.20.self_attn.q_norm.weight])[symbol:model.layers.20.self_attn.q_norm.weight] - tensor.CPU.register () -> (%288:tensor<[128], Float32, CPU>[@model.layers.20.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=755), symbol:model.layers.20.self_attn.k_norm.weight])[symbol:model.layers.20.self_attn.k_norm.weight] - tensor.CPU.register () -> (%41:tensor<[2048, 2048], Float32, CPU>[@model.layers.20.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=768), symbol:model.layers.20.self_attn.o_proj.weight])[symbol:model.layers.20.self_attn.o_proj.weight] - tensor.CPU.register () -> (%216:tensor<[2048], Float32, CPU>[@model.layers.20.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=771), symbol:model.layers.20.post_attention_layernorm.weight])[symbol:model.layers.20.post_attention_layernorm.weight] - tensor.CPU.register () -> (%172:tensor<[6144, 2048], Float32, CPU>[@model.layers.20.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=772), symbol:model.layers.20.mlp.gate_proj.weight])[symbol:model.layers.20.mlp.gate_proj.weight] - tensor.CPU.register () -> (%299:tensor<[6144, 2048], Float32, 
CPU>[@model.layers.20.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=775), symbol:model.layers.20.mlp.up_proj.weight])[symbol:model.layers.20.mlp.up_proj.weight] - tensor.CPU.register () -> (%123:tensor<[2048, 6144], Float32, CPU>[@model.layers.20.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=777), symbol:model.layers.20.mlp.down_proj.weight])[symbol:model.layers.20.mlp.down_proj.weight] - tensor.CPU.register () -> (%229:tensor<[2048], Float32, CPU>[@model.layers.21.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780), symbol:model.layers.21.input_layernorm.weight])[symbol:model.layers.21.input_layernorm.weight] - tensor.CPU.register () -> (%295:tensor<[2048, 2048], Float32, CPU>[@model.layers.21.self_attn.q_proj.weight][symbol:model.layers.21.self_attn.q_proj.weight])[symbol:model.layers.21.self_attn.q_proj.weight] - tensor.CPU.register () -> (%139:tensor<[1024, 2048], Float32, CPU>[@model.layers.21.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=781), symbol:model.layers.21.self_attn.k_proj.weight])[symbol:model.layers.21.self_attn.k_proj.weight] - tensor.CPU.register () -> (%142:tensor<[1024, 2048], Float32, CPU>[@model.layers.21.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=783), symbol:model.layers.21.self_attn.v_proj.weight])[symbol:model.layers.21.self_attn.v_proj.weight] - tensor.CPU.register () -> 
(%87:tensor<[128], Float32, CPU>[@model.layers.21.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=787), symbol:model.layers.21.self_attn.q_norm.weight])[symbol:model.layers.21.self_attn.q_norm.weight] - tensor.CPU.register () -> (%56:tensor<[128], Float32, CPU>[@model.layers.21.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789), symbol:model.layers.21.self_attn.k_norm.weight])[symbol:model.layers.21.self_attn.k_norm.weight] - tensor.CPU.register () -> (%115:tensor<[2048, 2048], Float32, CPU>[@model.layers.21.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=802), symbol:model.layers.21.self_attn.o_proj.weight])[symbol:model.layers.21.self_attn.o_proj.weight] - tensor.CPU.register () -> (%174:tensor<[2048], Float32, CPU>[@model.layers.21.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=805), symbol:model.layers.21.post_attention_layernorm.weight])[symbol:model.layers.21.post_attention_layernorm.weight] - tensor.CPU.register () -> (%259:tensor<[6144, 2048], Float32, CPU>[@model.layers.21.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=806), symbol:model.layers.21.mlp.gate_proj.weight])[symbol:model.layers.21.mlp.gate_proj.weight] - tensor.CPU.register () -> (%162:tensor<[6144, 2048], Float32, CPU>[@model.layers.21.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=809), 
symbol:model.layers.21.mlp.up_proj.weight])[symbol:model.layers.21.mlp.up_proj.weight] - tensor.CPU.register () -> (%183:tensor<[2048, 6144], Float32, CPU>[@model.layers.21.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=811), symbol:model.layers.21.mlp.down_proj.weight])[symbol:model.layers.21.mlp.down_proj.weight] - tensor.CPU.register () -> (%257:tensor<[2048], Float32, CPU>[@model.layers.22.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814), symbol:model.layers.22.input_layernorm.weight])[symbol:model.layers.22.input_layernorm.weight] - tensor.CPU.register () -> (%89:tensor<[2048, 2048], Float32, CPU>[@model.layers.22.self_attn.q_proj.weight][symbol:model.layers.22.self_attn.q_proj.weight])[symbol:model.layers.22.self_attn.q_proj.weight] - tensor.CPU.register () -> (%36:tensor<[1024, 2048], Float32, CPU>[@model.layers.22.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=815), symbol:model.layers.22.self_attn.k_proj.weight])[symbol:model.layers.22.self_attn.k_proj.weight] - tensor.CPU.register () -> (%204:tensor<[1024, 2048], Float32, CPU>[@model.layers.22.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=817), symbol:model.layers.22.self_attn.v_proj.weight])[symbol:model.layers.22.self_attn.v_proj.weight] - tensor.CPU.register () -> (%158:tensor<[128], Float32, CPU>[@model.layers.22.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=821), 
symbol:model.layers.22.self_attn.q_norm.weight])[symbol:model.layers.22.self_attn.q_norm.weight] - tensor.CPU.register () -> (%215:tensor<[128], Float32, CPU>[@model.layers.22.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=823), symbol:model.layers.22.self_attn.k_norm.weight])[symbol:model.layers.22.self_attn.k_norm.weight] - tensor.CPU.register () -> (%234:tensor<[2048, 2048], Float32, CPU>[@model.layers.22.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=836), symbol:model.layers.22.self_attn.o_proj.weight])[symbol:model.layers.22.self_attn.o_proj.weight] - tensor.CPU.register () -> (%270:tensor<[2048], Float32, CPU>[@model.layers.22.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=839), symbol:model.layers.22.post_attention_layernorm.weight])[symbol:model.layers.22.post_attention_layernorm.weight] - tensor.CPU.register () -> (%198:tensor<[6144, 2048], Float32, CPU>[@model.layers.22.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=840), symbol:model.layers.22.mlp.gate_proj.weight])[symbol:model.layers.22.mlp.gate_proj.weight] - tensor.CPU.register () -> (%254:tensor<[6144, 2048], Float32, CPU>[@model.layers.22.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=843), symbol:model.layers.22.mlp.up_proj.weight])[symbol:model.layers.22.mlp.up_proj.weight] - tensor.CPU.register () -> (%31:tensor<[2048, 6144], Float32, 
CPU>[@model.layers.22.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=845), symbol:model.layers.22.mlp.down_proj.weight])[symbol:model.layers.22.mlp.down_proj.weight] - tensor.CPU.register () -> (%292:tensor<[2048], Float32, CPU>[@model.layers.23.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=848), symbol:model.layers.23.input_layernorm.weight])[symbol:model.layers.23.input_layernorm.weight] - tensor.CPU.register () -> (%109:tensor<[2048, 2048], Float32, CPU>[@model.layers.23.self_attn.q_proj.weight][symbol:model.layers.23.self_attn.q_proj.weight])[symbol:model.layers.23.self_attn.q_proj.weight] - tensor.CPU.register () -> (%39:tensor<[1024, 2048], Float32, CPU>[@model.layers.23.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=849), symbol:model.layers.23.self_attn.k_proj.weight])[symbol:model.layers.23.self_attn.k_proj.weight] - tensor.CPU.register () -> (%83:tensor<[1024, 2048], Float32, CPU>[@model.layers.23.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=851), symbol:model.layers.23.self_attn.v_proj.weight])[symbol:model.layers.23.self_attn.v_proj.weight] - tensor.CPU.register () -> (%293:tensor<[128], Float32, CPU>[@model.layers.23.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=855), symbol:model.layers.23.self_attn.q_norm.weight])[symbol:model.layers.23.self_attn.q_norm.weight] - tensor.CPU.register () -> (%134:tensor<[128], Float32, 
CPU>[@model.layers.23.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=857), symbol:model.layers.23.self_attn.k_norm.weight])[symbol:model.layers.23.self_attn.k_norm.weight] - tensor.CPU.register () -> (%176:tensor<[2048, 2048], Float32, CPU>[@model.layers.23.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=870), symbol:model.layers.23.self_attn.o_proj.weight])[symbol:model.layers.23.self_attn.o_proj.weight] - tensor.CPU.register () -> (%170:tensor<[2048], Float32, CPU>[@model.layers.23.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=873), symbol:model.layers.23.post_attention_layernorm.weight])[symbol:model.layers.23.post_attention_layernorm.weight] - tensor.CPU.register () -> (%169:tensor<[6144, 2048], Float32, CPU>[@model.layers.23.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=874), symbol:model.layers.23.mlp.gate_proj.weight])[symbol:model.layers.23.mlp.gate_proj.weight] - tensor.CPU.register () -> (%243:tensor<[6144, 2048], Float32, CPU>[@model.layers.23.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=877), symbol:model.layers.23.mlp.up_proj.weight])[symbol:model.layers.23.mlp.up_proj.weight] - tensor.CPU.register () -> (%149:tensor<[2048, 6144], Float32, CPU>[@model.layers.23.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), 
uuid=879), symbol:model.layers.23.mlp.down_proj.weight])[symbol:model.layers.23.mlp.down_proj.weight] - tensor.CPU.register () -> (%13:tensor<[2048], Float32, CPU>[@model.layers.24.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882), symbol:model.layers.24.input_layernorm.weight])[symbol:model.layers.24.input_layernorm.weight] - tensor.CPU.register () -> (%11:tensor<[2048, 2048], Float32, CPU>[@model.layers.24.self_attn.q_proj.weight][symbol:model.layers.24.self_attn.q_proj.weight])[symbol:model.layers.24.self_attn.q_proj.weight] - tensor.CPU.register () -> (%61:tensor<[1024, 2048], Float32, CPU>[@model.layers.24.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=883), symbol:model.layers.24.self_attn.k_proj.weight])[symbol:model.layers.24.self_attn.k_proj.weight] - tensor.CPU.register () -> (%81:tensor<[1024, 2048], Float32, CPU>[@model.layers.24.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=885), symbol:model.layers.24.self_attn.v_proj.weight])[symbol:model.layers.24.self_attn.v_proj.weight] - tensor.CPU.register () -> (%90:tensor<[128], Float32, CPU>[@model.layers.24.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=889), symbol:model.layers.24.self_attn.q_norm.weight])[symbol:model.layers.24.self_attn.q_norm.weight] - tensor.CPU.register () -> (%19:tensor<[128], Float32, CPU>[@model.layers.24.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=891), symbol:model.layers.24.self_attn.k_norm.weight])[symbol:model.layers.24.self_attn.k_norm.weight] - tensor.CPU.register () -> 
(%127:tensor<[2048, 2048], Float32, CPU>[@model.layers.24.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=904), symbol:model.layers.24.self_attn.o_proj.weight])[symbol:model.layers.24.self_attn.o_proj.weight] - tensor.CPU.register () -> (%77:tensor<[2048], Float32, CPU>[@model.layers.24.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=907), symbol:model.layers.24.post_attention_layernorm.weight])[symbol:model.layers.24.post_attention_layernorm.weight] - tensor.CPU.register () -> (%141:tensor<[6144, 2048], Float32, CPU>[@model.layers.24.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=908), symbol:model.layers.24.mlp.gate_proj.weight])[symbol:model.layers.24.mlp.gate_proj.weight] - tensor.CPU.register () -> (%126:tensor<[6144, 2048], Float32, CPU>[@model.layers.24.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=911), symbol:model.layers.24.mlp.up_proj.weight])[symbol:model.layers.24.mlp.up_proj.weight] - tensor.CPU.register () -> (%34:tensor<[2048, 6144], Float32, CPU>[@model.layers.24.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=913), symbol:model.layers.24.mlp.down_proj.weight])[symbol:model.layers.24.mlp.down_proj.weight] - tensor.CPU.register () -> (%196:tensor<[2048], Float32, CPU>[@model.layers.25.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=916), symbol:model.layers.25.input_layernorm.weight])[symbol:model.layers.25.input_layernorm.weight] - tensor.CPU.register () -> (%206:tensor<[2048, 2048], Float32, CPU>[@model.layers.25.self_attn.q_proj.weight][symbol:model.layers.25.self_attn.q_proj.weight])[symbol:model.layers.25.self_attn.q_proj.weight] - tensor.CPU.register () -> (%27:tensor<[1024, 2048], Float32, CPU>[@model.layers.25.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=917), symbol:model.layers.25.self_attn.k_proj.weight])[symbol:model.layers.25.self_attn.k_proj.weight] - tensor.CPU.register () -> (%121:tensor<[1024, 2048], Float32, CPU>[@model.layers.25.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=919), symbol:model.layers.25.self_attn.v_proj.weight])[symbol:model.layers.25.self_attn.v_proj.weight] - tensor.CPU.register () -> (%310:tensor<[128], Float32, CPU>[@model.layers.25.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=923), symbol:model.layers.25.self_attn.q_norm.weight])[symbol:model.layers.25.self_attn.q_norm.weight] - tensor.CPU.register () -> (%187:tensor<[128], Float32, CPU>[@model.layers.25.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=925), symbol:model.layers.25.self_attn.k_norm.weight])[symbol:model.layers.25.self_attn.k_norm.weight] - tensor.CPU.register () -> (%150:tensor<[2048, 2048], Float32, CPU>[@model.layers.25.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=938), 
symbol:model.layers.25.self_attn.o_proj.weight])[symbol:model.layers.25.self_attn.o_proj.weight] - tensor.CPU.register () -> (%175:tensor<[2048], Float32, CPU>[@model.layers.25.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=941), symbol:model.layers.25.post_attention_layernorm.weight])[symbol:model.layers.25.post_attention_layernorm.weight] - tensor.CPU.register () -> (%249:tensor<[6144, 2048], Float32, CPU>[@model.layers.25.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=942), symbol:model.layers.25.mlp.gate_proj.weight])[symbol:model.layers.25.mlp.gate_proj.weight] - tensor.CPU.register () -> (%159:tensor<[6144, 2048], Float32, CPU>[@model.layers.25.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=945), symbol:model.layers.25.mlp.up_proj.weight])[symbol:model.layers.25.mlp.up_proj.weight] - tensor.CPU.register () -> (%267:tensor<[2048, 6144], Float32, CPU>[@model.layers.25.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=947), symbol:model.layers.25.mlp.down_proj.weight])[symbol:model.layers.25.mlp.down_proj.weight] - tensor.CPU.register () -> (%302:tensor<[2048], Float32, CPU>[@model.layers.26.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=950), symbol:model.layers.26.input_layernorm.weight])[symbol:model.layers.26.input_layernorm.weight] - tensor.CPU.register () -> (%265:tensor<[2048, 2048], Float32, 
CPU>[@model.layers.26.self_attn.q_proj.weight][symbol:model.layers.26.self_attn.q_proj.weight])[symbol:model.layers.26.self_attn.q_proj.weight] - tensor.CPU.register () -> (%190:tensor<[1024, 2048], Float32, CPU>[@model.layers.26.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=951), symbol:model.layers.26.self_attn.k_proj.weight])[symbol:model.layers.26.self_attn.k_proj.weight] - tensor.CPU.register () -> (%119:tensor<[1024, 2048], Float32, CPU>[@model.layers.26.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=953), symbol:model.layers.26.self_attn.v_proj.weight])[symbol:model.layers.26.self_attn.v_proj.weight] - tensor.CPU.register () -> (%70:tensor<[128], Float32, CPU>[@model.layers.26.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=957), symbol:model.layers.26.self_attn.q_norm.weight])[symbol:model.layers.26.self_attn.q_norm.weight] - tensor.CPU.register () -> (%35:tensor<[128], Float32, CPU>[@model.layers.26.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=959), symbol:model.layers.26.self_attn.k_norm.weight])[symbol:model.layers.26.self_attn.k_norm.weight] - tensor.CPU.register () -> (%88:tensor<[2048, 2048], Float32, CPU>[@model.layers.26.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=972), symbol:model.layers.26.self_attn.o_proj.weight])[symbol:model.layers.26.self_attn.o_proj.weight] - tensor.CPU.register () -> (%298:tensor<[2048], Float32, 
CPU>[@model.layers.26.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=975), symbol:model.layers.26.post_attention_layernorm.weight])[symbol:model.layers.26.post_attention_layernorm.weight] - tensor.CPU.register () -> (%96:tensor<[6144, 2048], Float32, CPU>[@model.layers.26.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=976), symbol:model.layers.26.mlp.gate_proj.weight])[symbol:model.layers.26.mlp.gate_proj.weight] - tensor.CPU.register () -> (%62:tensor<[6144, 2048], Float32, CPU>[@model.layers.26.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=979), symbol:model.layers.26.mlp.up_proj.weight])[symbol:model.layers.26.mlp.up_proj.weight] - tensor.CPU.register () -> (%220:tensor<[2048, 6144], Float32, CPU>[@model.layers.26.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=981), symbol:model.layers.26.mlp.down_proj.weight])[symbol:model.layers.26.mlp.down_proj.weight] - tensor.CPU.register () -> (%44:tensor<[2048], Float32, CPU>[@model.layers.27.input_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=984), symbol:model.layers.27.input_layernorm.weight])[symbol:model.layers.27.input_layernorm.weight] - tensor.CPU.register () -> (%185:tensor<[2048, 2048], Float32, CPU>[@model.layers.27.self_attn.q_proj.weight][symbol:model.layers.27.self_attn.q_proj.weight])[symbol:model.layers.27.self_attn.q_proj.weight] - tensor.CPU.register () -> (%12:tensor<[1024, 2048], Float32, 
CPU>[@model.layers.27.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=985), symbol:model.layers.27.self_attn.k_proj.weight])[symbol:model.layers.27.self_attn.k_proj.weight] - tensor.CPU.register () -> (%54:tensor<[1024, 2048], Float32, CPU>[@model.layers.27.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=987), symbol:model.layers.27.self_attn.v_proj.weight])[symbol:model.layers.27.self_attn.v_proj.weight] - tensor.CPU.register () -> (%192:tensor<[128], Float32, CPU>[@model.layers.27.self_attn.q_norm.weight][qnn_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=991), symbol:model.layers.27.self_attn.q_norm.weight])[symbol:model.layers.27.self_attn.q_norm.weight] - tensor.CPU.register () -> (%241:tensor<[128], Float32, CPU>[@model.layers.27.self_attn.k_norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=993), symbol:model.layers.27.self_attn.k_norm.weight])[symbol:model.layers.27.self_attn.k_norm.weight] - tensor.CPU.register () -> (%60:tensor<[2048, 2048], Float32, CPU>[@model.layers.27.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1006), symbol:model.layers.27.self_attn.o_proj.weight])[symbol:model.layers.27.self_attn.o_proj.weight] - tensor.CPU.register () -> (%104:tensor<[2048], Float32, CPU>[@model.layers.27.post_attention_layernorm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1009), 
symbol:model.layers.27.post_attention_layernorm.weight])[symbol:model.layers.27.post_attention_layernorm.weight] - tensor.CPU.register () -> (%144:tensor<[6144, 2048], Float32, CPU>[@model.layers.27.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1010), symbol:model.layers.27.mlp.gate_proj.weight])[symbol:model.layers.27.mlp.gate_proj.weight] - tensor.CPU.register () -> (%146:tensor<[6144, 2048], Float32, CPU>[@model.layers.27.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1013), symbol:model.layers.27.mlp.up_proj.weight])[symbol:model.layers.27.mlp.up_proj.weight] - tensor.CPU.register () -> (%195:tensor<[2048, 6144], Float32, CPU>[@model.layers.27.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1015), symbol:model.layers.27.mlp.down_proj.weight])[symbol:model.layers.27.mlp.down_proj.weight] - tensor.CPU.register () -> (%5:tensor<[2048], Float32, CPU>[@model.norm.weight][qnn_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1018), symbol:model.norm.weight])[symbol:model.norm.weight] - tensor.CPU.register () -> (%101:tensor<[151936, 2048], Float32, CPU>[@lm_head.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1019), symbol:lm_head.weight])[symbol:lm_head.weight] + tensor.CPU.register () -> (%7516:tensor<[151936, 2048], Float32, CPU>[@model.embed_tokens.weight][quant_recipe:QuantSpec(Raw(type: Float32), uuid=61), 
symbol:model.embed_tokens.weight])[symbol:model.embed_tokens.weight] + tensor.CPU.register () -> (%8011:tensor<[1, 1024, 128], Int16PerTensor, CPU>[@rope_sin][symbol:rope_sin])[symbol:rope_sin] + tensor.CPU.register () -> (%8012:tensor<[1, 1024, 128], Int16PerTensor, CPU>[@rope_cos][symbol:rope_cos])[symbol:rope_cos] + tensor.CPU.register () -> (%6662:tensor<[2048], Float32, CPU>[@model.layers.0.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=67), symbol:model.layers.0.input_layernorm.weight])[symbol:model.layers.0.input_layernorm.weight] + tensor.CPU.register () -> (%7778:tensor<[2048, 2048], Float32, CPU>[@model.layers.0.self_attn.q_proj.weight][symbol:model.layers.0.self_attn.q_proj.weight])[symbol:model.layers.0.self_attn.q_proj.weight] + tensor.CPU.register () -> (%61:tensor<[1024, 2048], Float32, CPU>[@model.layers.0.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=68), symbol:model.layers.0.self_attn.k_proj.weight])[symbol:model.layers.0.self_attn.k_proj.weight] + tensor.CPU.register () -> (%5178:tensor<[1024, 2048], Float32, CPU>[@model.layers.0.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=70), symbol:model.layers.0.self_attn.v_proj.weight])[symbol:model.layers.0.self_attn.v_proj.weight] + tensor.CPU.register () -> (%1867:tensor<[128], Float32, CPU>[@model.layers.0.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=74), symbol:model.layers.0.self_attn.q_norm.weight])[symbol:model.layers.0.self_attn.q_norm.weight] + tensor.CPU.register () -> (%7469:tensor<[128], Float32, 
CPU>[@model.layers.0.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=76), symbol:model.layers.0.self_attn.k_norm.weight])[symbol:model.layers.0.self_attn.k_norm.weight] + tensor.CPU.register () -> (%7880:tensor<[2048, 2048], Float32, CPU>[@model.layers.0.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=89), symbol:model.layers.0.self_attn.o_proj.weight])[symbol:model.layers.0.self_attn.o_proj.weight] + tensor.CPU.register () -> (%3163:tensor<[2048], Float32, CPU>[@model.layers.0.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=92), symbol:model.layers.0.post_attention_layernorm.weight])[symbol:model.layers.0.post_attention_layernorm.weight] + tensor.CPU.register () -> (%3038:tensor<[6144, 2048], Float32, CPU>[@model.layers.0.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=93), symbol:model.layers.0.mlp.gate_proj.weight])[symbol:model.layers.0.mlp.gate_proj.weight] + tensor.CPU.register () -> (%184:tensor<[6144, 2048], Float32, CPU>[@model.layers.0.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=96), symbol:model.layers.0.mlp.up_proj.weight])[symbol:model.layers.0.mlp.up_proj.weight] + tensor.CPU.register () -> (%7449:tensor<[2048, 6144], Float32, CPU>[@model.layers.0.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=98), 
symbol:model.layers.0.mlp.down_proj.weight])[symbol:model.layers.0.mlp.down_proj.weight] + tensor.CPU.register () -> (%3526:tensor<[2048], Float32, CPU>[@model.layers.1.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=101), symbol:model.layers.1.input_layernorm.weight])[symbol:model.layers.1.input_layernorm.weight] + tensor.CPU.register () -> (%2471:tensor<[2048, 2048], Float32, CPU>[@model.layers.1.self_attn.q_proj.weight][symbol:model.layers.1.self_attn.q_proj.weight])[symbol:model.layers.1.self_attn.q_proj.weight] + tensor.CPU.register () -> (%5492:tensor<[1024, 2048], Float32, CPU>[@model.layers.1.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=102), symbol:model.layers.1.self_attn.k_proj.weight])[symbol:model.layers.1.self_attn.k_proj.weight] + tensor.CPU.register () -> (%554:tensor<[1024, 2048], Float32, CPU>[@model.layers.1.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=104), symbol:model.layers.1.self_attn.v_proj.weight])[symbol:model.layers.1.self_attn.v_proj.weight] + tensor.CPU.register () -> (%5159:tensor<[128], Float32, CPU>[@model.layers.1.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=108), symbol:model.layers.1.self_attn.q_norm.weight])[symbol:model.layers.1.self_attn.q_norm.weight] + tensor.CPU.register () -> (%6337:tensor<[128], Float32, CPU>[@model.layers.1.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=110), symbol:model.layers.1.self_attn.k_norm.weight])[symbol:model.layers.1.self_attn.k_norm.weight] + tensor.CPU.register () -> 
(%3431:tensor<[2048, 2048], Float32, CPU>[@model.layers.1.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=123), symbol:model.layers.1.self_attn.o_proj.weight])[symbol:model.layers.1.self_attn.o_proj.weight] + tensor.CPU.register () -> (%7183:tensor<[2048], Float32, CPU>[@model.layers.1.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=126), symbol:model.layers.1.post_attention_layernorm.weight])[symbol:model.layers.1.post_attention_layernorm.weight] + tensor.CPU.register () -> (%6960:tensor<[6144, 2048], Float32, CPU>[@model.layers.1.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=127), symbol:model.layers.1.mlp.gate_proj.weight])[symbol:model.layers.1.mlp.gate_proj.weight] + tensor.CPU.register () -> (%7251:tensor<[6144, 2048], Float32, CPU>[@model.layers.1.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=130), symbol:model.layers.1.mlp.up_proj.weight])[symbol:model.layers.1.mlp.up_proj.weight] + tensor.CPU.register () -> (%6256:tensor<[2048, 6144], Float32, CPU>[@model.layers.1.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=132), symbol:model.layers.1.mlp.down_proj.weight])[symbol:model.layers.1.mlp.down_proj.weight] + tensor.CPU.register () -> (%7411:tensor<[2048], Float32, CPU>[@model.layers.2.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=135), symbol:model.layers.2.input_layernorm.weight])[symbol:model.layers.2.input_layernorm.weight] + tensor.CPU.register () -> (%4879:tensor<[2048, 2048], Float32, CPU>[@model.layers.2.self_attn.q_proj.weight][symbol:model.layers.2.self_attn.q_proj.weight])[symbol:model.layers.2.self_attn.q_proj.weight] + tensor.CPU.register () -> (%725:tensor<[1024, 2048], Float32, CPU>[@model.layers.2.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=136), symbol:model.layers.2.self_attn.k_proj.weight])[symbol:model.layers.2.self_attn.k_proj.weight] + tensor.CPU.register () -> (%2701:tensor<[1024, 2048], Float32, CPU>[@model.layers.2.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=138), symbol:model.layers.2.self_attn.v_proj.weight])[symbol:model.layers.2.self_attn.v_proj.weight] + tensor.CPU.register () -> (%7660:tensor<[128], Float32, CPU>[@model.layers.2.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=142), symbol:model.layers.2.self_attn.q_norm.weight])[symbol:model.layers.2.self_attn.q_norm.weight] + tensor.CPU.register () -> (%5749:tensor<[128], Float32, CPU>[@model.layers.2.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=144), symbol:model.layers.2.self_attn.k_norm.weight])[symbol:model.layers.2.self_attn.k_norm.weight] + tensor.CPU.register () -> (%1525:tensor<[2048, 2048], Float32, CPU>[@model.layers.2.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=157), 
symbol:model.layers.2.self_attn.o_proj.weight])[symbol:model.layers.2.self_attn.o_proj.weight] + tensor.CPU.register () -> (%6444:tensor<[2048], Float32, CPU>[@model.layers.2.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=160), symbol:model.layers.2.post_attention_layernorm.weight])[symbol:model.layers.2.post_attention_layernorm.weight] + tensor.CPU.register () -> (%3201:tensor<[6144, 2048], Float32, CPU>[@model.layers.2.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=161), symbol:model.layers.2.mlp.gate_proj.weight])[symbol:model.layers.2.mlp.gate_proj.weight] + tensor.CPU.register () -> (%4120:tensor<[6144, 2048], Float32, CPU>[@model.layers.2.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=164), symbol:model.layers.2.mlp.up_proj.weight])[symbol:model.layers.2.mlp.up_proj.weight] + tensor.CPU.register () -> (%1962:tensor<[2048, 6144], Float32, CPU>[@model.layers.2.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=166), symbol:model.layers.2.mlp.down_proj.weight])[symbol:model.layers.2.mlp.down_proj.weight] + tensor.CPU.register () -> (%3250:tensor<[2048], Float32, CPU>[@model.layers.3.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=169), symbol:model.layers.3.input_layernorm.weight])[symbol:model.layers.3.input_layernorm.weight] + tensor.CPU.register () -> (%5564:tensor<[2048, 2048], Float32, 
CPU>[@model.layers.3.self_attn.q_proj.weight][symbol:model.layers.3.self_attn.q_proj.weight])[symbol:model.layers.3.self_attn.q_proj.weight] + tensor.CPU.register () -> (%3502:tensor<[1024, 2048], Float32, CPU>[@model.layers.3.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=170), symbol:model.layers.3.self_attn.k_proj.weight])[symbol:model.layers.3.self_attn.k_proj.weight] + tensor.CPU.register () -> (%2402:tensor<[1024, 2048], Float32, CPU>[@model.layers.3.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=172), symbol:model.layers.3.self_attn.v_proj.weight])[symbol:model.layers.3.self_attn.v_proj.weight] + tensor.CPU.register () -> (%1747:tensor<[128], Float32, CPU>[@model.layers.3.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=176), symbol:model.layers.3.self_attn.q_norm.weight])[symbol:model.layers.3.self_attn.q_norm.weight] + tensor.CPU.register () -> (%4846:tensor<[128], Float32, CPU>[@model.layers.3.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=178), symbol:model.layers.3.self_attn.k_norm.weight])[symbol:model.layers.3.self_attn.k_norm.weight] + tensor.CPU.register () -> (%3109:tensor<[2048, 2048], Float32, CPU>[@model.layers.3.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=191), symbol:model.layers.3.self_attn.o_proj.weight])[symbol:model.layers.3.self_attn.o_proj.weight] + tensor.CPU.register () -> (%7221:tensor<[2048], Float32, 
CPU>[@model.layers.3.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=194), symbol:model.layers.3.post_attention_layernorm.weight])[symbol:model.layers.3.post_attention_layernorm.weight] + tensor.CPU.register () -> (%7181:tensor<[6144, 2048], Float32, CPU>[@model.layers.3.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=195), symbol:model.layers.3.mlp.gate_proj.weight])[symbol:model.layers.3.mlp.gate_proj.weight] + tensor.CPU.register () -> (%2714:tensor<[6144, 2048], Float32, CPU>[@model.layers.3.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=198), symbol:model.layers.3.mlp.up_proj.weight])[symbol:model.layers.3.mlp.up_proj.weight] + tensor.CPU.register () -> (%4573:tensor<[2048, 6144], Float32, CPU>[@model.layers.3.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=200), symbol:model.layers.3.mlp.down_proj.weight])[symbol:model.layers.3.mlp.down_proj.weight] + tensor.CPU.register () -> (%5536:tensor<[2048], Float32, CPU>[@model.layers.4.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=203), symbol:model.layers.4.input_layernorm.weight])[symbol:model.layers.4.input_layernorm.weight] + tensor.CPU.register () -> (%463:tensor<[2048, 2048], Float32, CPU>[@model.layers.4.self_attn.q_proj.weight][symbol:model.layers.4.self_attn.q_proj.weight])[symbol:model.layers.4.self_attn.q_proj.weight] + tensor.CPU.register () -> (%5989:tensor<[1024, 2048], Float32, 
CPU>[@model.layers.4.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=204), symbol:model.layers.4.self_attn.k_proj.weight])[symbol:model.layers.4.self_attn.k_proj.weight] + tensor.CPU.register () -> (%3443:tensor<[1024, 2048], Float32, CPU>[@model.layers.4.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=206), symbol:model.layers.4.self_attn.v_proj.weight])[symbol:model.layers.4.self_attn.v_proj.weight] + tensor.CPU.register () -> (%926:tensor<[128], Float32, CPU>[@model.layers.4.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=210), symbol:model.layers.4.self_attn.q_norm.weight])[symbol:model.layers.4.self_attn.q_norm.weight] + tensor.CPU.register () -> (%5648:tensor<[128], Float32, CPU>[@model.layers.4.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=212), symbol:model.layers.4.self_attn.k_norm.weight])[symbol:model.layers.4.self_attn.k_norm.weight] + tensor.CPU.register () -> (%256:tensor<[2048, 2048], Float32, CPU>[@model.layers.4.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=225), symbol:model.layers.4.self_attn.o_proj.weight])[symbol:model.layers.4.self_attn.o_proj.weight] + tensor.CPU.register () -> (%3101:tensor<[2048], Float32, CPU>[@model.layers.4.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=228), symbol:model.layers.4.post_attention_layernorm.weight])[symbol:model.layers.4.post_attention_layernorm.weight] 
+ tensor.CPU.register () -> (%15:tensor<[6144, 2048], Float32, CPU>[@model.layers.4.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=229), symbol:model.layers.4.mlp.gate_proj.weight])[symbol:model.layers.4.mlp.gate_proj.weight] + tensor.CPU.register () -> (%3494:tensor<[6144, 2048], Float32, CPU>[@model.layers.4.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=232), symbol:model.layers.4.mlp.up_proj.weight])[symbol:model.layers.4.mlp.up_proj.weight] + tensor.CPU.register () -> (%6518:tensor<[2048, 6144], Float32, CPU>[@model.layers.4.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=234), symbol:model.layers.4.mlp.down_proj.weight])[symbol:model.layers.4.mlp.down_proj.weight] + tensor.CPU.register () -> (%7246:tensor<[2048], Float32, CPU>[@model.layers.5.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=237), symbol:model.layers.5.input_layernorm.weight])[symbol:model.layers.5.input_layernorm.weight] + tensor.CPU.register () -> (%3752:tensor<[2048, 2048], Float32, CPU>[@model.layers.5.self_attn.q_proj.weight][symbol:model.layers.5.self_attn.q_proj.weight])[symbol:model.layers.5.self_attn.q_proj.weight] + tensor.CPU.register () -> (%2143:tensor<[1024, 2048], Float32, CPU>[@model.layers.5.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=238), symbol:model.layers.5.self_attn.k_proj.weight])[symbol:model.layers.5.self_attn.k_proj.weight] 
+ tensor.CPU.register () -> (%5753:tensor<[1024, 2048], Float32, CPU>[@model.layers.5.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=240), symbol:model.layers.5.self_attn.v_proj.weight])[symbol:model.layers.5.self_attn.v_proj.weight] + tensor.CPU.register () -> (%4774:tensor<[128], Float32, CPU>[@model.layers.5.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=244), symbol:model.layers.5.self_attn.q_norm.weight])[symbol:model.layers.5.self_attn.q_norm.weight] + tensor.CPU.register () -> (%1215:tensor<[128], Float32, CPU>[@model.layers.5.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=246), symbol:model.layers.5.self_attn.k_norm.weight])[symbol:model.layers.5.self_attn.k_norm.weight] + tensor.CPU.register () -> (%2076:tensor<[2048, 2048], Float32, CPU>[@model.layers.5.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=259), symbol:model.layers.5.self_attn.o_proj.weight])[symbol:model.layers.5.self_attn.o_proj.weight] + tensor.CPU.register () -> (%6883:tensor<[2048], Float32, CPU>[@model.layers.5.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=262), symbol:model.layers.5.post_attention_layernorm.weight])[symbol:model.layers.5.post_attention_layernorm.weight] + tensor.CPU.register () -> (%5485:tensor<[6144, 2048], Float32, CPU>[@model.layers.5.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=263), 
symbol:model.layers.5.mlp.gate_proj.weight])[symbol:model.layers.5.mlp.gate_proj.weight] + tensor.CPU.register () -> (%759:tensor<[6144, 2048], Float32, CPU>[@model.layers.5.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=266), symbol:model.layers.5.mlp.up_proj.weight])[symbol:model.layers.5.mlp.up_proj.weight] + tensor.CPU.register () -> (%6315:tensor<[2048, 6144], Float32, CPU>[@model.layers.5.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=268), symbol:model.layers.5.mlp.down_proj.weight])[symbol:model.layers.5.mlp.down_proj.weight] + tensor.CPU.register () -> (%7090:tensor<[2048], Float32, CPU>[@model.layers.6.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=271), symbol:model.layers.6.input_layernorm.weight])[symbol:model.layers.6.input_layernorm.weight] + tensor.CPU.register () -> (%3125:tensor<[2048, 2048], Float32, CPU>[@model.layers.6.self_attn.q_proj.weight][symbol:model.layers.6.self_attn.q_proj.weight])[symbol:model.layers.6.self_attn.q_proj.weight] + tensor.CPU.register () -> (%1798:tensor<[1024, 2048], Float32, CPU>[@model.layers.6.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=272), symbol:model.layers.6.self_attn.k_proj.weight])[symbol:model.layers.6.self_attn.k_proj.weight] + tensor.CPU.register () -> (%1047:tensor<[1024, 2048], Float32, CPU>[@model.layers.6.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), 
uuid=274), symbol:model.layers.6.self_attn.v_proj.weight])[symbol:model.layers.6.self_attn.v_proj.weight] + tensor.CPU.register () -> (%7385:tensor<[128], Float32, CPU>[@model.layers.6.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=278), symbol:model.layers.6.self_attn.q_norm.weight])[symbol:model.layers.6.self_attn.q_norm.weight] + tensor.CPU.register () -> (%5603:tensor<[128], Float32, CPU>[@model.layers.6.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=280), symbol:model.layers.6.self_attn.k_norm.weight])[symbol:model.layers.6.self_attn.k_norm.weight] + tensor.CPU.register () -> (%6862:tensor<[2048, 2048], Float32, CPU>[@model.layers.6.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=293), symbol:model.layers.6.self_attn.o_proj.weight])[symbol:model.layers.6.self_attn.o_proj.weight] + tensor.CPU.register () -> (%4161:tensor<[2048], Float32, CPU>[@model.layers.6.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=296), symbol:model.layers.6.post_attention_layernorm.weight])[symbol:model.layers.6.post_attention_layernorm.weight] + tensor.CPU.register () -> (%5295:tensor<[6144, 2048], Float32, CPU>[@model.layers.6.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=297), symbol:model.layers.6.mlp.gate_proj.weight])[symbol:model.layers.6.mlp.gate_proj.weight] + tensor.CPU.register () -> (%4710:tensor<[6144, 2048], Float32, CPU>[@model.layers.6.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=300), symbol:model.layers.6.mlp.up_proj.weight])[symbol:model.layers.6.mlp.up_proj.weight] + tensor.CPU.register () -> (%4929:tensor<[2048, 6144], Float32, CPU>[@model.layers.6.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=302), symbol:model.layers.6.mlp.down_proj.weight])[symbol:model.layers.6.mlp.down_proj.weight] + tensor.CPU.register () -> (%4605:tensor<[2048], Float32, CPU>[@model.layers.7.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=305), symbol:model.layers.7.input_layernorm.weight])[symbol:model.layers.7.input_layernorm.weight] + tensor.CPU.register () -> (%4585:tensor<[2048, 2048], Float32, CPU>[@model.layers.7.self_attn.q_proj.weight][symbol:model.layers.7.self_attn.q_proj.weight])[symbol:model.layers.7.self_attn.q_proj.weight] + tensor.CPU.register () -> (%1:tensor<[1024, 2048], Float32, CPU>[@model.layers.7.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=306), symbol:model.layers.7.self_attn.k_proj.weight])[symbol:model.layers.7.self_attn.k_proj.weight] + tensor.CPU.register () -> (%2341:tensor<[1024, 2048], Float32, CPU>[@model.layers.7.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=308), symbol:model.layers.7.self_attn.v_proj.weight])[symbol:model.layers.7.self_attn.v_proj.weight] + tensor.CPU.register () -> (%5151:tensor<[128], Float32, CPU>[@model.layers.7.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=312), 
symbol:model.layers.7.self_attn.q_norm.weight])[symbol:model.layers.7.self_attn.q_norm.weight] + tensor.CPU.register () -> (%3437:tensor<[128], Float32, CPU>[@model.layers.7.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=314), symbol:model.layers.7.self_attn.k_norm.weight])[symbol:model.layers.7.self_attn.k_norm.weight] + tensor.CPU.register () -> (%3368:tensor<[2048, 2048], Float32, CPU>[@model.layers.7.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=327), symbol:model.layers.7.self_attn.o_proj.weight])[symbol:model.layers.7.self_attn.o_proj.weight] + tensor.CPU.register () -> (%68:tensor<[2048], Float32, CPU>[@model.layers.7.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=330), symbol:model.layers.7.post_attention_layernorm.weight])[symbol:model.layers.7.post_attention_layernorm.weight] + tensor.CPU.register () -> (%324:tensor<[6144, 2048], Float32, CPU>[@model.layers.7.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=331), symbol:model.layers.7.mlp.gate_proj.weight])[symbol:model.layers.7.mlp.gate_proj.weight] + tensor.CPU.register () -> (%5551:tensor<[6144, 2048], Float32, CPU>[@model.layers.7.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=334), symbol:model.layers.7.mlp.up_proj.weight])[symbol:model.layers.7.mlp.up_proj.weight] + tensor.CPU.register () -> (%7894:tensor<[2048, 6144], Float32, 
CPU>[@model.layers.7.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=336), symbol:model.layers.7.mlp.down_proj.weight])[symbol:model.layers.7.mlp.down_proj.weight] + tensor.CPU.register () -> (%3851:tensor<[2048], Float32, CPU>[@model.layers.8.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=339), symbol:model.layers.8.input_layernorm.weight])[symbol:model.layers.8.input_layernorm.weight] + tensor.CPU.register () -> (%5874:tensor<[2048, 2048], Float32, CPU>[@model.layers.8.self_attn.q_proj.weight][symbol:model.layers.8.self_attn.q_proj.weight])[symbol:model.layers.8.self_attn.q_proj.weight] + tensor.CPU.register () -> (%1863:tensor<[1024, 2048], Float32, CPU>[@model.layers.8.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=340), symbol:model.layers.8.self_attn.k_proj.weight])[symbol:model.layers.8.self_attn.k_proj.weight] + tensor.CPU.register () -> (%3204:tensor<[1024, 2048], Float32, CPU>[@model.layers.8.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=342), symbol:model.layers.8.self_attn.v_proj.weight])[symbol:model.layers.8.self_attn.v_proj.weight] + tensor.CPU.register () -> (%2301:tensor<[128], Float32, CPU>[@model.layers.8.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=346), symbol:model.layers.8.self_attn.q_norm.weight])[symbol:model.layers.8.self_attn.q_norm.weight] + tensor.CPU.register () -> (%7373:tensor<[128], Float32, CPU>[@model.layers.8.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=348), symbol:model.layers.8.self_attn.k_norm.weight])[symbol:model.layers.8.self_attn.k_norm.weight] + tensor.CPU.register () -> (%6303:tensor<[2048, 2048], Float32, CPU>[@model.layers.8.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=361), symbol:model.layers.8.self_attn.o_proj.weight])[symbol:model.layers.8.self_attn.o_proj.weight] + tensor.CPU.register () -> (%1997:tensor<[2048], Float32, CPU>[@model.layers.8.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=364), symbol:model.layers.8.post_attention_layernorm.weight])[symbol:model.layers.8.post_attention_layernorm.weight] + tensor.CPU.register () -> (%6731:tensor<[6144, 2048], Float32, CPU>[@model.layers.8.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=365), symbol:model.layers.8.mlp.gate_proj.weight])[symbol:model.layers.8.mlp.gate_proj.weight] + tensor.CPU.register () -> (%5478:tensor<[6144, 2048], Float32, CPU>[@model.layers.8.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=368), symbol:model.layers.8.mlp.up_proj.weight])[symbol:model.layers.8.mlp.up_proj.weight] + tensor.CPU.register () -> (%4734:tensor<[2048, 6144], Float32, CPU>[@model.layers.8.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=370), symbol:model.layers.8.mlp.down_proj.weight])[symbol:model.layers.8.mlp.down_proj.weight] 
+ tensor.CPU.register () -> (%4963:tensor<[2048], Float32, CPU>[@model.layers.9.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=373), symbol:model.layers.9.input_layernorm.weight])[symbol:model.layers.9.input_layernorm.weight] + tensor.CPU.register () -> (%137:tensor<[2048, 2048], Float32, CPU>[@model.layers.9.self_attn.q_proj.weight][symbol:model.layers.9.self_attn.q_proj.weight])[symbol:model.layers.9.self_attn.q_proj.weight] + tensor.CPU.register () -> (%2689:tensor<[1024, 2048], Float32, CPU>[@model.layers.9.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=374), symbol:model.layers.9.self_attn.k_proj.weight])[symbol:model.layers.9.self_attn.k_proj.weight] + tensor.CPU.register () -> (%4027:tensor<[1024, 2048], Float32, CPU>[@model.layers.9.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=376), symbol:model.layers.9.self_attn.v_proj.weight])[symbol:model.layers.9.self_attn.v_proj.weight] + tensor.CPU.register () -> (%1375:tensor<[128], Float32, CPU>[@model.layers.9.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=380), symbol:model.layers.9.self_attn.q_norm.weight])[symbol:model.layers.9.self_attn.q_norm.weight] + tensor.CPU.register () -> (%4962:tensor<[128], Float32, CPU>[@model.layers.9.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=382), symbol:model.layers.9.self_attn.k_norm.weight])[symbol:model.layers.9.self_attn.k_norm.weight] + tensor.CPU.register () -> (%6399:tensor<[2048, 2048], Float32, 
CPU>[@model.layers.9.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=395), symbol:model.layers.9.self_attn.o_proj.weight])[symbol:model.layers.9.self_attn.o_proj.weight] + tensor.CPU.register () -> (%2594:tensor<[2048], Float32, CPU>[@model.layers.9.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=398), symbol:model.layers.9.post_attention_layernorm.weight])[symbol:model.layers.9.post_attention_layernorm.weight] + tensor.CPU.register () -> (%3833:tensor<[6144, 2048], Float32, CPU>[@model.layers.9.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=399), symbol:model.layers.9.mlp.gate_proj.weight])[symbol:model.layers.9.mlp.gate_proj.weight] + tensor.CPU.register () -> (%2358:tensor<[6144, 2048], Float32, CPU>[@model.layers.9.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=402), symbol:model.layers.9.mlp.up_proj.weight])[symbol:model.layers.9.mlp.up_proj.weight] + tensor.CPU.register () -> (%3947:tensor<[2048, 6144], Float32, CPU>[@model.layers.9.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=404), symbol:model.layers.9.mlp.down_proj.weight])[symbol:model.layers.9.mlp.down_proj.weight] + tensor.CPU.register () -> (%3229:tensor<[2048], Float32, CPU>[@model.layers.10.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=407), 
symbol:model.layers.10.input_layernorm.weight])[symbol:model.layers.10.input_layernorm.weight] + tensor.CPU.register () -> (%5022:tensor<[2048, 2048], Float32, CPU>[@model.layers.10.self_attn.q_proj.weight][symbol:model.layers.10.self_attn.q_proj.weight])[symbol:model.layers.10.self_attn.q_proj.weight] + tensor.CPU.register () -> (%2867:tensor<[1024, 2048], Float32, CPU>[@model.layers.10.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=408), symbol:model.layers.10.self_attn.k_proj.weight])[symbol:model.layers.10.self_attn.k_proj.weight] + tensor.CPU.register () -> (%567:tensor<[1024, 2048], Float32, CPU>[@model.layers.10.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=410), symbol:model.layers.10.self_attn.v_proj.weight])[symbol:model.layers.10.self_attn.v_proj.weight] + tensor.CPU.register () -> (%7008:tensor<[128], Float32, CPU>[@model.layers.10.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=414), symbol:model.layers.10.self_attn.q_norm.weight])[symbol:model.layers.10.self_attn.q_norm.weight] + tensor.CPU.register () -> (%6953:tensor<[128], Float32, CPU>[@model.layers.10.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=416), symbol:model.layers.10.self_attn.k_norm.weight])[symbol:model.layers.10.self_attn.k_norm.weight] + tensor.CPU.register () -> (%5479:tensor<[2048, 2048], Float32, CPU>[@model.layers.10.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=429), 
symbol:model.layers.10.self_attn.o_proj.weight])[symbol:model.layers.10.self_attn.o_proj.weight] + tensor.CPU.register () -> (%3177:tensor<[2048], Float32, CPU>[@model.layers.10.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=432), symbol:model.layers.10.post_attention_layernorm.weight])[symbol:model.layers.10.post_attention_layernorm.weight] + tensor.CPU.register () -> (%7857:tensor<[6144, 2048], Float32, CPU>[@model.layers.10.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=433), symbol:model.layers.10.mlp.gate_proj.weight])[symbol:model.layers.10.mlp.gate_proj.weight] + tensor.CPU.register () -> (%3620:tensor<[6144, 2048], Float32, CPU>[@model.layers.10.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=436), symbol:model.layers.10.mlp.up_proj.weight])[symbol:model.layers.10.mlp.up_proj.weight] + tensor.CPU.register () -> (%4172:tensor<[2048, 6144], Float32, CPU>[@model.layers.10.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=438), symbol:model.layers.10.mlp.down_proj.weight])[symbol:model.layers.10.mlp.down_proj.weight] + tensor.CPU.register () -> (%1820:tensor<[2048], Float32, CPU>[@model.layers.11.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=441), symbol:model.layers.11.input_layernorm.weight])[symbol:model.layers.11.input_layernorm.weight] + tensor.CPU.register () -> (%4375:tensor<[2048, 2048], Float32, 
CPU>[@model.layers.11.self_attn.q_proj.weight][symbol:model.layers.11.self_attn.q_proj.weight])[symbol:model.layers.11.self_attn.q_proj.weight] + tensor.CPU.register () -> (%3805:tensor<[1024, 2048], Float32, CPU>[@model.layers.11.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=442), symbol:model.layers.11.self_attn.k_proj.weight])[symbol:model.layers.11.self_attn.k_proj.weight] + tensor.CPU.register () -> (%5348:tensor<[1024, 2048], Float32, CPU>[@model.layers.11.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=444), symbol:model.layers.11.self_attn.v_proj.weight])[symbol:model.layers.11.self_attn.v_proj.weight] + tensor.CPU.register () -> (%1018:tensor<[128], Float32, CPU>[@model.layers.11.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=448), symbol:model.layers.11.self_attn.q_norm.weight])[symbol:model.layers.11.self_attn.q_norm.weight] + tensor.CPU.register () -> (%5323:tensor<[128], Float32, CPU>[@model.layers.11.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=450), symbol:model.layers.11.self_attn.k_norm.weight])[symbol:model.layers.11.self_attn.k_norm.weight] + tensor.CPU.register () -> (%6587:tensor<[2048, 2048], Float32, CPU>[@model.layers.11.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=463), symbol:model.layers.11.self_attn.o_proj.weight])[symbol:model.layers.11.self_attn.o_proj.weight] + tensor.CPU.register () -> (%2072:tensor<[2048], Float32, 
CPU>[@model.layers.11.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=466), symbol:model.layers.11.post_attention_layernorm.weight])[symbol:model.layers.11.post_attention_layernorm.weight] + tensor.CPU.register () -> (%5180:tensor<[6144, 2048], Float32, CPU>[@model.layers.11.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=467), symbol:model.layers.11.mlp.gate_proj.weight])[symbol:model.layers.11.mlp.gate_proj.weight] + tensor.CPU.register () -> (%1917:tensor<[6144, 2048], Float32, CPU>[@model.layers.11.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=470), symbol:model.layers.11.mlp.up_proj.weight])[symbol:model.layers.11.mlp.up_proj.weight] + tensor.CPU.register () -> (%2810:tensor<[2048, 6144], Float32, CPU>[@model.layers.11.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=472), symbol:model.layers.11.mlp.down_proj.weight])[symbol:model.layers.11.mlp.down_proj.weight] + tensor.CPU.register () -> (%4945:tensor<[2048], Float32, CPU>[@model.layers.12.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=475), symbol:model.layers.12.input_layernorm.weight])[symbol:model.layers.12.input_layernorm.weight] + tensor.CPU.register () -> (%6926:tensor<[2048, 2048], Float32, CPU>[@model.layers.12.self_attn.q_proj.weight][symbol:model.layers.12.self_attn.q_proj.weight])[symbol:model.layers.12.self_attn.q_proj.weight] + tensor.CPU.register () -> (%2741:tensor<[1024, 2048], 
Float32, CPU>[@model.layers.12.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=476), symbol:model.layers.12.self_attn.k_proj.weight])[symbol:model.layers.12.self_attn.k_proj.weight] + tensor.CPU.register () -> (%3690:tensor<[1024, 2048], Float32, CPU>[@model.layers.12.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=478), symbol:model.layers.12.self_attn.v_proj.weight])[symbol:model.layers.12.self_attn.v_proj.weight] + tensor.CPU.register () -> (%5447:tensor<[128], Float32, CPU>[@model.layers.12.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=482), symbol:model.layers.12.self_attn.q_norm.weight])[symbol:model.layers.12.self_attn.q_norm.weight] + tensor.CPU.register () -> (%5437:tensor<[128], Float32, CPU>[@model.layers.12.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=484), symbol:model.layers.12.self_attn.k_norm.weight])[symbol:model.layers.12.self_attn.k_norm.weight] + tensor.CPU.register () -> (%4785:tensor<[2048, 2048], Float32, CPU>[@model.layers.12.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=497), symbol:model.layers.12.self_attn.o_proj.weight])[symbol:model.layers.12.self_attn.o_proj.weight] + tensor.CPU.register () -> (%1343:tensor<[2048], Float32, CPU>[@model.layers.12.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=500), 
symbol:model.layers.12.post_attention_layernorm.weight])[symbol:model.layers.12.post_attention_layernorm.weight] + tensor.CPU.register () -> (%3306:tensor<[6144, 2048], Float32, CPU>[@model.layers.12.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=501), symbol:model.layers.12.mlp.gate_proj.weight])[symbol:model.layers.12.mlp.gate_proj.weight] + tensor.CPU.register () -> (%2123:tensor<[6144, 2048], Float32, CPU>[@model.layers.12.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=504), symbol:model.layers.12.mlp.up_proj.weight])[symbol:model.layers.12.mlp.up_proj.weight] + tensor.CPU.register () -> (%2005:tensor<[2048, 6144], Float32, CPU>[@model.layers.12.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=506), symbol:model.layers.12.mlp.down_proj.weight])[symbol:model.layers.12.mlp.down_proj.weight] + tensor.CPU.register () -> (%1812:tensor<[2048], Float32, CPU>[@model.layers.13.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=509), symbol:model.layers.13.input_layernorm.weight])[symbol:model.layers.13.input_layernorm.weight] + tensor.CPU.register () -> (%7043:tensor<[2048, 2048], Float32, CPU>[@model.layers.13.self_attn.q_proj.weight][symbol:model.layers.13.self_attn.q_proj.weight])[symbol:model.layers.13.self_attn.q_proj.weight] + tensor.CPU.register () -> (%229:tensor<[1024, 2048], Float32, CPU>[@model.layers.13.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, 
scale_1_type: Float32), uuid=510), symbol:model.layers.13.self_attn.k_proj.weight])[symbol:model.layers.13.self_attn.k_proj.weight] + tensor.CPU.register () -> (%1019:tensor<[1024, 2048], Float32, CPU>[@model.layers.13.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=512), symbol:model.layers.13.self_attn.v_proj.weight])[symbol:model.layers.13.self_attn.v_proj.weight] + tensor.CPU.register () -> (%3318:tensor<[128], Float32, CPU>[@model.layers.13.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=516), symbol:model.layers.13.self_attn.q_norm.weight])[symbol:model.layers.13.self_attn.q_norm.weight] + tensor.CPU.register () -> (%2503:tensor<[128], Float32, CPU>[@model.layers.13.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=518), symbol:model.layers.13.self_attn.k_norm.weight])[symbol:model.layers.13.self_attn.k_norm.weight] + tensor.CPU.register () -> (%3883:tensor<[2048, 2048], Float32, CPU>[@model.layers.13.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=531), symbol:model.layers.13.self_attn.o_proj.weight])[symbol:model.layers.13.self_attn.o_proj.weight] + tensor.CPU.register () -> (%6904:tensor<[2048], Float32, CPU>[@model.layers.13.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=534), symbol:model.layers.13.post_attention_layernorm.weight])[symbol:model.layers.13.post_attention_layernorm.weight] + tensor.CPU.register () -> (%5444:tensor<[6144, 2048], Float32, CPU>[@model.layers.13.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, 
quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=535), symbol:model.layers.13.mlp.gate_proj.weight])[symbol:model.layers.13.mlp.gate_proj.weight] + tensor.CPU.register () -> (%3100:tensor<[6144, 2048], Float32, CPU>[@model.layers.13.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=538), symbol:model.layers.13.mlp.up_proj.weight])[symbol:model.layers.13.mlp.up_proj.weight] + tensor.CPU.register () -> (%6631:tensor<[2048, 6144], Float32, CPU>[@model.layers.13.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=540), symbol:model.layers.13.mlp.down_proj.weight])[symbol:model.layers.13.mlp.down_proj.weight] + tensor.CPU.register () -> (%5555:tensor<[2048], Float32, CPU>[@model.layers.14.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=543), symbol:model.layers.14.input_layernorm.weight])[symbol:model.layers.14.input_layernorm.weight] + tensor.CPU.register () -> (%1210:tensor<[2048, 2048], Float32, CPU>[@model.layers.14.self_attn.q_proj.weight][symbol:model.layers.14.self_attn.q_proj.weight])[symbol:model.layers.14.self_attn.q_proj.weight] + tensor.CPU.register () -> (%3756:tensor<[1024, 2048], Float32, CPU>[@model.layers.14.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=544), symbol:model.layers.14.self_attn.k_proj.weight])[symbol:model.layers.14.self_attn.k_proj.weight] + tensor.CPU.register () -> (%5243:tensor<[1024, 2048], Float32, 
CPU>[@model.layers.14.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=546), symbol:model.layers.14.self_attn.v_proj.weight])[symbol:model.layers.14.self_attn.v_proj.weight] + tensor.CPU.register () -> (%3796:tensor<[128], Float32, CPU>[@model.layers.14.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=550), symbol:model.layers.14.self_attn.q_norm.weight])[symbol:model.layers.14.self_attn.q_norm.weight] + tensor.CPU.register () -> (%3974:tensor<[128], Float32, CPU>[@model.layers.14.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=552), symbol:model.layers.14.self_attn.k_norm.weight])[symbol:model.layers.14.self_attn.k_norm.weight] + tensor.CPU.register () -> (%3797:tensor<[2048, 2048], Float32, CPU>[@model.layers.14.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=565), symbol:model.layers.14.self_attn.o_proj.weight])[symbol:model.layers.14.self_attn.o_proj.weight] + tensor.CPU.register () -> (%4508:tensor<[2048], Float32, CPU>[@model.layers.14.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=568), symbol:model.layers.14.post_attention_layernorm.weight])[symbol:model.layers.14.post_attention_layernorm.weight] + tensor.CPU.register () -> (%7092:tensor<[6144, 2048], Float32, CPU>[@model.layers.14.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=569), 
symbol:model.layers.14.mlp.gate_proj.weight])[symbol:model.layers.14.mlp.gate_proj.weight] + tensor.CPU.register () -> (%7164:tensor<[6144, 2048], Float32, CPU>[@model.layers.14.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=572), symbol:model.layers.14.mlp.up_proj.weight])[symbol:model.layers.14.mlp.up_proj.weight] + tensor.CPU.register () -> (%4419:tensor<[2048, 6144], Float32, CPU>[@model.layers.14.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=574), symbol:model.layers.14.mlp.down_proj.weight])[symbol:model.layers.14.mlp.down_proj.weight] + tensor.CPU.register () -> (%5590:tensor<[2048], Float32, CPU>[@model.layers.15.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=577), symbol:model.layers.15.input_layernorm.weight])[symbol:model.layers.15.input_layernorm.weight] + tensor.CPU.register () -> (%5843:tensor<[2048, 2048], Float32, CPU>[@model.layers.15.self_attn.q_proj.weight][symbol:model.layers.15.self_attn.q_proj.weight])[symbol:model.layers.15.self_attn.q_proj.weight] + tensor.CPU.register () -> (%938:tensor<[1024, 2048], Float32, CPU>[@model.layers.15.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=578), symbol:model.layers.15.self_attn.k_proj.weight])[symbol:model.layers.15.self_attn.k_proj.weight] + tensor.CPU.register () -> (%3967:tensor<[1024, 2048], Float32, CPU>[@model.layers.15.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, 
scale_1_type: Float32), uuid=580), symbol:model.layers.15.self_attn.v_proj.weight])[symbol:model.layers.15.self_attn.v_proj.weight] + tensor.CPU.register () -> (%3289:tensor<[128], Float32, CPU>[@model.layers.15.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=584), symbol:model.layers.15.self_attn.q_norm.weight])[symbol:model.layers.15.self_attn.q_norm.weight] + tensor.CPU.register () -> (%6756:tensor<[128], Float32, CPU>[@model.layers.15.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=586), symbol:model.layers.15.self_attn.k_norm.weight])[symbol:model.layers.15.self_attn.k_norm.weight] + tensor.CPU.register () -> (%4838:tensor<[2048, 2048], Float32, CPU>[@model.layers.15.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=599), symbol:model.layers.15.self_attn.o_proj.weight])[symbol:model.layers.15.self_attn.o_proj.weight] + tensor.CPU.register () -> (%6774:tensor<[2048], Float32, CPU>[@model.layers.15.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=602), symbol:model.layers.15.post_attention_layernorm.weight])[symbol:model.layers.15.post_attention_layernorm.weight] + tensor.CPU.register () -> (%2819:tensor<[6144, 2048], Float32, CPU>[@model.layers.15.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=603), symbol:model.layers.15.mlp.gate_proj.weight])[symbol:model.layers.15.mlp.gate_proj.weight] + tensor.CPU.register () -> (%1377:tensor<[6144, 2048], Float32, CPU>[@model.layers.15.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, 
block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=606), symbol:model.layers.15.mlp.up_proj.weight])[symbol:model.layers.15.mlp.up_proj.weight] + tensor.CPU.register () -> (%526:tensor<[2048, 6144], Float32, CPU>[@model.layers.15.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=608), symbol:model.layers.15.mlp.down_proj.weight])[symbol:model.layers.15.mlp.down_proj.weight] + tensor.CPU.register () -> (%369:tensor<[2048], Float32, CPU>[@model.layers.16.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=611), symbol:model.layers.16.input_layernorm.weight])[symbol:model.layers.16.input_layernorm.weight] + tensor.CPU.register () -> (%2345:tensor<[2048, 2048], Float32, CPU>[@model.layers.16.self_attn.q_proj.weight][symbol:model.layers.16.self_attn.q_proj.weight])[symbol:model.layers.16.self_attn.q_proj.weight] + tensor.CPU.register () -> (%3022:tensor<[1024, 2048], Float32, CPU>[@model.layers.16.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=612), symbol:model.layers.16.self_attn.k_proj.weight])[symbol:model.layers.16.self_attn.k_proj.weight] + tensor.CPU.register () -> (%2931:tensor<[1024, 2048], Float32, CPU>[@model.layers.16.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=614), symbol:model.layers.16.self_attn.v_proj.weight])[symbol:model.layers.16.self_attn.v_proj.weight] + tensor.CPU.register () -> (%1150:tensor<[128], Float32, 
CPU>[@model.layers.16.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=618), symbol:model.layers.16.self_attn.q_norm.weight])[symbol:model.layers.16.self_attn.q_norm.weight] + tensor.CPU.register () -> (%5521:tensor<[128], Float32, CPU>[@model.layers.16.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=620), symbol:model.layers.16.self_attn.k_norm.weight])[symbol:model.layers.16.self_attn.k_norm.weight] + tensor.CPU.register () -> (%672:tensor<[2048, 2048], Float32, CPU>[@model.layers.16.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=633), symbol:model.layers.16.self_attn.o_proj.weight])[symbol:model.layers.16.self_attn.o_proj.weight] + tensor.CPU.register () -> (%6793:tensor<[2048], Float32, CPU>[@model.layers.16.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=636), symbol:model.layers.16.post_attention_layernorm.weight])[symbol:model.layers.16.post_attention_layernorm.weight] + tensor.CPU.register () -> (%993:tensor<[6144, 2048], Float32, CPU>[@model.layers.16.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=637), symbol:model.layers.16.mlp.gate_proj.weight])[symbol:model.layers.16.mlp.gate_proj.weight] + tensor.CPU.register () -> (%226:tensor<[6144, 2048], Float32, CPU>[@model.layers.16.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=640), symbol:model.layers.16.mlp.up_proj.weight])[symbol:model.layers.16.mlp.up_proj.weight] 
+ tensor.CPU.register () -> (%7287:tensor<[2048, 6144], Float32, CPU>[@model.layers.16.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=642), symbol:model.layers.16.mlp.down_proj.weight])[symbol:model.layers.16.mlp.down_proj.weight] + tensor.CPU.register () -> (%7811:tensor<[2048], Float32, CPU>[@model.layers.17.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=645), symbol:model.layers.17.input_layernorm.weight])[symbol:model.layers.17.input_layernorm.weight] + tensor.CPU.register () -> (%5758:tensor<[2048, 2048], Float32, CPU>[@model.layers.17.self_attn.q_proj.weight][symbol:model.layers.17.self_attn.q_proj.weight])[symbol:model.layers.17.self_attn.q_proj.weight] + tensor.CPU.register () -> (%2828:tensor<[1024, 2048], Float32, CPU>[@model.layers.17.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=646), symbol:model.layers.17.self_attn.k_proj.weight])[symbol:model.layers.17.self_attn.k_proj.weight] + tensor.CPU.register () -> (%417:tensor<[1024, 2048], Float32, CPU>[@model.layers.17.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=648), symbol:model.layers.17.self_attn.v_proj.weight])[symbol:model.layers.17.self_attn.v_proj.weight] + tensor.CPU.register () -> (%59:tensor<[128], Float32, CPU>[@model.layers.17.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=652), symbol:model.layers.17.self_attn.q_norm.weight])[symbol:model.layers.17.self_attn.q_norm.weight] + tensor.CPU.register () -> (%7588:tensor<[128], Float32, 
CPU>[@model.layers.17.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=654), symbol:model.layers.17.self_attn.k_norm.weight])[symbol:model.layers.17.self_attn.k_norm.weight] + tensor.CPU.register () -> (%5285:tensor<[2048, 2048], Float32, CPU>[@model.layers.17.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=667), symbol:model.layers.17.self_attn.o_proj.weight])[symbol:model.layers.17.self_attn.o_proj.weight] + tensor.CPU.register () -> (%3787:tensor<[2048], Float32, CPU>[@model.layers.17.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=670), symbol:model.layers.17.post_attention_layernorm.weight])[symbol:model.layers.17.post_attention_layernorm.weight] + tensor.CPU.register () -> (%4841:tensor<[6144, 2048], Float32, CPU>[@model.layers.17.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=671), symbol:model.layers.17.mlp.gate_proj.weight])[symbol:model.layers.17.mlp.gate_proj.weight] + tensor.CPU.register () -> (%4784:tensor<[6144, 2048], Float32, CPU>[@model.layers.17.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=674), symbol:model.layers.17.mlp.up_proj.weight])[symbol:model.layers.17.mlp.up_proj.weight] + tensor.CPU.register () -> (%1908:tensor<[2048, 6144], Float32, CPU>[@model.layers.17.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: 
Float32), uuid=676), symbol:model.layers.17.mlp.down_proj.weight])[symbol:model.layers.17.mlp.down_proj.weight] + tensor.CPU.register () -> (%310:tensor<[2048], Float32, CPU>[@model.layers.18.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=679), symbol:model.layers.18.input_layernorm.weight])[symbol:model.layers.18.input_layernorm.weight] + tensor.CPU.register () -> (%7352:tensor<[2048, 2048], Float32, CPU>[@model.layers.18.self_attn.q_proj.weight][symbol:model.layers.18.self_attn.q_proj.weight])[symbol:model.layers.18.self_attn.q_proj.weight] + tensor.CPU.register () -> (%6436:tensor<[1024, 2048], Float32, CPU>[@model.layers.18.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=680), symbol:model.layers.18.self_attn.k_proj.weight])[symbol:model.layers.18.self_attn.k_proj.weight] + tensor.CPU.register () -> (%6164:tensor<[1024, 2048], Float32, CPU>[@model.layers.18.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=682), symbol:model.layers.18.self_attn.v_proj.weight])[symbol:model.layers.18.self_attn.v_proj.weight] + tensor.CPU.register () -> (%2747:tensor<[128], Float32, CPU>[@model.layers.18.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=686), symbol:model.layers.18.self_attn.q_norm.weight])[symbol:model.layers.18.self_attn.q_norm.weight] + tensor.CPU.register () -> (%5281:tensor<[128], Float32, CPU>[@model.layers.18.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=688), symbol:model.layers.18.self_attn.k_norm.weight])[symbol:model.layers.18.self_attn.k_norm.weight] 
+ tensor.CPU.register () -> (%7646:tensor<[2048, 2048], Float32, CPU>[@model.layers.18.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=701), symbol:model.layers.18.self_attn.o_proj.weight])[symbol:model.layers.18.self_attn.o_proj.weight] + tensor.CPU.register () -> (%2540:tensor<[2048], Float32, CPU>[@model.layers.18.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=704), symbol:model.layers.18.post_attention_layernorm.weight])[symbol:model.layers.18.post_attention_layernorm.weight] + tensor.CPU.register () -> (%6101:tensor<[6144, 2048], Float32, CPU>[@model.layers.18.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=705), symbol:model.layers.18.mlp.gate_proj.weight])[symbol:model.layers.18.mlp.gate_proj.weight] + tensor.CPU.register () -> (%2195:tensor<[6144, 2048], Float32, CPU>[@model.layers.18.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=708), symbol:model.layers.18.mlp.up_proj.weight])[symbol:model.layers.18.mlp.up_proj.weight] + tensor.CPU.register () -> (%3651:tensor<[2048, 6144], Float32, CPU>[@model.layers.18.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=710), symbol:model.layers.18.mlp.down_proj.weight])[symbol:model.layers.18.mlp.down_proj.weight] + tensor.CPU.register () -> (%3722:tensor<[2048], Float32, CPU>[@model.layers.19.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=713), symbol:model.layers.19.input_layernorm.weight])[symbol:model.layers.19.input_layernorm.weight] + tensor.CPU.register () -> (%1141:tensor<[2048, 2048], Float32, CPU>[@model.layers.19.self_attn.q_proj.weight][symbol:model.layers.19.self_attn.q_proj.weight])[symbol:model.layers.19.self_attn.q_proj.weight] + tensor.CPU.register () -> (%651:tensor<[1024, 2048], Float32, CPU>[@model.layers.19.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=714), symbol:model.layers.19.self_attn.k_proj.weight])[symbol:model.layers.19.self_attn.k_proj.weight] + tensor.CPU.register () -> (%254:tensor<[1024, 2048], Float32, CPU>[@model.layers.19.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=716), symbol:model.layers.19.self_attn.v_proj.weight])[symbol:model.layers.19.self_attn.v_proj.weight] + tensor.CPU.register () -> (%610:tensor<[128], Float32, CPU>[@model.layers.19.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=720), symbol:model.layers.19.self_attn.q_norm.weight])[symbol:model.layers.19.self_attn.q_norm.weight] + tensor.CPU.register () -> (%3691:tensor<[128], Float32, CPU>[@model.layers.19.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=722), symbol:model.layers.19.self_attn.k_norm.weight])[symbol:model.layers.19.self_attn.k_norm.weight] + tensor.CPU.register () -> (%7002:tensor<[2048, 2048], Float32, CPU>[@model.layers.19.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), 
uuid=735), symbol:model.layers.19.self_attn.o_proj.weight])[symbol:model.layers.19.self_attn.o_proj.weight] + tensor.CPU.register () -> (%3446:tensor<[2048], Float32, CPU>[@model.layers.19.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=738), symbol:model.layers.19.post_attention_layernorm.weight])[symbol:model.layers.19.post_attention_layernorm.weight] + tensor.CPU.register () -> (%2118:tensor<[6144, 2048], Float32, CPU>[@model.layers.19.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=739), symbol:model.layers.19.mlp.gate_proj.weight])[symbol:model.layers.19.mlp.gate_proj.weight] + tensor.CPU.register () -> (%283:tensor<[6144, 2048], Float32, CPU>[@model.layers.19.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=742), symbol:model.layers.19.mlp.up_proj.weight])[symbol:model.layers.19.mlp.up_proj.weight] + tensor.CPU.register () -> (%1264:tensor<[2048, 6144], Float32, CPU>[@model.layers.19.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=744), symbol:model.layers.19.mlp.down_proj.weight])[symbol:model.layers.19.mlp.down_proj.weight] + tensor.CPU.register () -> (%5183:tensor<[2048], Float32, CPU>[@model.layers.20.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=747), symbol:model.layers.20.input_layernorm.weight])[symbol:model.layers.20.input_layernorm.weight] + tensor.CPU.register () -> (%6004:tensor<[2048, 2048], Float32, 
CPU>[@model.layers.20.self_attn.q_proj.weight][symbol:model.layers.20.self_attn.q_proj.weight])[symbol:model.layers.20.self_attn.q_proj.weight] + tensor.CPU.register () -> (%4764:tensor<[1024, 2048], Float32, CPU>[@model.layers.20.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=748), symbol:model.layers.20.self_attn.k_proj.weight])[symbol:model.layers.20.self_attn.k_proj.weight] + tensor.CPU.register () -> (%3516:tensor<[1024, 2048], Float32, CPU>[@model.layers.20.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=750), symbol:model.layers.20.self_attn.v_proj.weight])[symbol:model.layers.20.self_attn.v_proj.weight] + tensor.CPU.register () -> (%2042:tensor<[128], Float32, CPU>[@model.layers.20.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=754), symbol:model.layers.20.self_attn.q_norm.weight])[symbol:model.layers.20.self_attn.q_norm.weight] + tensor.CPU.register () -> (%1646:tensor<[128], Float32, CPU>[@model.layers.20.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=756), symbol:model.layers.20.self_attn.k_norm.weight])[symbol:model.layers.20.self_attn.k_norm.weight] + tensor.CPU.register () -> (%3587:tensor<[2048, 2048], Float32, CPU>[@model.layers.20.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=769), symbol:model.layers.20.self_attn.o_proj.weight])[symbol:model.layers.20.self_attn.o_proj.weight] + tensor.CPU.register () -> (%2726:tensor<[2048], Float32, 
CPU>[@model.layers.20.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=772), symbol:model.layers.20.post_attention_layernorm.weight])[symbol:model.layers.20.post_attention_layernorm.weight] + tensor.CPU.register () -> (%3656:tensor<[6144, 2048], Float32, CPU>[@model.layers.20.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=773), symbol:model.layers.20.mlp.gate_proj.weight])[symbol:model.layers.20.mlp.gate_proj.weight] + tensor.CPU.register () -> (%802:tensor<[6144, 2048], Float32, CPU>[@model.layers.20.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=776), symbol:model.layers.20.mlp.up_proj.weight])[symbol:model.layers.20.mlp.up_proj.weight] + tensor.CPU.register () -> (%62:tensor<[2048, 6144], Float32, CPU>[@model.layers.20.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=778), symbol:model.layers.20.mlp.down_proj.weight])[symbol:model.layers.20.mlp.down_proj.weight] + tensor.CPU.register () -> (%1237:tensor<[2048], Float32, CPU>[@model.layers.21.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=781), symbol:model.layers.21.input_layernorm.weight])[symbol:model.layers.21.input_layernorm.weight] + tensor.CPU.register () -> (%2397:tensor<[2048, 2048], Float32, CPU>[@model.layers.21.self_attn.q_proj.weight][symbol:model.layers.21.self_attn.q_proj.weight])[symbol:model.layers.21.self_attn.q_proj.weight] + tensor.CPU.register () -> (%7562:tensor<[1024, 2048], Float32, 
CPU>[@model.layers.21.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=782), symbol:model.layers.21.self_attn.k_proj.weight])[symbol:model.layers.21.self_attn.k_proj.weight] + tensor.CPU.register () -> (%4665:tensor<[1024, 2048], Float32, CPU>[@model.layers.21.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=784), symbol:model.layers.21.self_attn.v_proj.weight])[symbol:model.layers.21.self_attn.v_proj.weight] + tensor.CPU.register () -> (%6195:tensor<[128], Float32, CPU>[@model.layers.21.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=788), symbol:model.layers.21.self_attn.q_norm.weight])[symbol:model.layers.21.self_attn.q_norm.weight] + tensor.CPU.register () -> (%701:tensor<[128], Float32, CPU>[@model.layers.21.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=790), symbol:model.layers.21.self_attn.k_norm.weight])[symbol:model.layers.21.self_attn.k_norm.weight] + tensor.CPU.register () -> (%5913:tensor<[2048, 2048], Float32, CPU>[@model.layers.21.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=803), symbol:model.layers.21.self_attn.o_proj.weight])[symbol:model.layers.21.self_attn.o_proj.weight] + tensor.CPU.register () -> (%4765:tensor<[2048], Float32, CPU>[@model.layers.21.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=806), 
symbol:model.layers.21.post_attention_layernorm.weight])[symbol:model.layers.21.post_attention_layernorm.weight] + tensor.CPU.register () -> (%864:tensor<[6144, 2048], Float32, CPU>[@model.layers.21.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=807), symbol:model.layers.21.mlp.gate_proj.weight])[symbol:model.layers.21.mlp.gate_proj.weight] + tensor.CPU.register () -> (%923:tensor<[6144, 2048], Float32, CPU>[@model.layers.21.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=810), symbol:model.layers.21.mlp.up_proj.weight])[symbol:model.layers.21.mlp.up_proj.weight] + tensor.CPU.register () -> (%6934:tensor<[2048, 6144], Float32, CPU>[@model.layers.21.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=812), symbol:model.layers.21.mlp.down_proj.weight])[symbol:model.layers.21.mlp.down_proj.weight] + tensor.CPU.register () -> (%425:tensor<[2048], Float32, CPU>[@model.layers.22.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=815), symbol:model.layers.22.input_layernorm.weight])[symbol:model.layers.22.input_layernorm.weight] + tensor.CPU.register () -> (%1036:tensor<[2048, 2048], Float32, CPU>[@model.layers.22.self_attn.q_proj.weight][symbol:model.layers.22.self_attn.q_proj.weight])[symbol:model.layers.22.self_attn.q_proj.weight] + tensor.CPU.register () -> (%6990:tensor<[1024, 2048], Float32, CPU>[@model.layers.22.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, 
scale_1_type: Float32), uuid=816), symbol:model.layers.22.self_attn.k_proj.weight])[symbol:model.layers.22.self_attn.k_proj.weight] + tensor.CPU.register () -> (%2703:tensor<[1024, 2048], Float32, CPU>[@model.layers.22.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=818), symbol:model.layers.22.self_attn.v_proj.weight])[symbol:model.layers.22.self_attn.v_proj.weight] + tensor.CPU.register () -> (%1995:tensor<[128], Float32, CPU>[@model.layers.22.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=822), symbol:model.layers.22.self_attn.q_norm.weight])[symbol:model.layers.22.self_attn.q_norm.weight] + tensor.CPU.register () -> (%2702:tensor<[128], Float32, CPU>[@model.layers.22.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=824), symbol:model.layers.22.self_attn.k_norm.weight])[symbol:model.layers.22.self_attn.k_norm.weight] + tensor.CPU.register () -> (%2221:tensor<[2048, 2048], Float32, CPU>[@model.layers.22.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=837), symbol:model.layers.22.self_attn.o_proj.weight])[symbol:model.layers.22.self_attn.o_proj.weight] + tensor.CPU.register () -> (%5286:tensor<[2048], Float32, CPU>[@model.layers.22.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=840), symbol:model.layers.22.post_attention_layernorm.weight])[symbol:model.layers.22.post_attention_layernorm.weight] + tensor.CPU.register () -> (%7377:tensor<[6144, 2048], Float32, CPU>[@model.layers.22.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, 
quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=841), symbol:model.layers.22.mlp.gate_proj.weight])[symbol:model.layers.22.mlp.gate_proj.weight] + tensor.CPU.register () -> (%694:tensor<[6144, 2048], Float32, CPU>[@model.layers.22.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=844), symbol:model.layers.22.mlp.up_proj.weight])[symbol:model.layers.22.mlp.up_proj.weight] + tensor.CPU.register () -> (%1401:tensor<[2048, 6144], Float32, CPU>[@model.layers.22.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=846), symbol:model.layers.22.mlp.down_proj.weight])[symbol:model.layers.22.mlp.down_proj.weight] + tensor.CPU.register () -> (%809:tensor<[2048], Float32, CPU>[@model.layers.23.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=849), symbol:model.layers.23.input_layernorm.weight])[symbol:model.layers.23.input_layernorm.weight] + tensor.CPU.register () -> (%2936:tensor<[2048, 2048], Float32, CPU>[@model.layers.23.self_attn.q_proj.weight][symbol:model.layers.23.self_attn.q_proj.weight])[symbol:model.layers.23.self_attn.q_proj.weight] + tensor.CPU.register () -> (%577:tensor<[1024, 2048], Float32, CPU>[@model.layers.23.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=850), symbol:model.layers.23.self_attn.k_proj.weight])[symbol:model.layers.23.self_attn.k_proj.weight] + tensor.CPU.register () -> (%5308:tensor<[1024, 2048], Float32, 
CPU>[@model.layers.23.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=852), symbol:model.layers.23.self_attn.v_proj.weight])[symbol:model.layers.23.self_attn.v_proj.weight] + tensor.CPU.register () -> (%5454:tensor<[128], Float32, CPU>[@model.layers.23.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=856), symbol:model.layers.23.self_attn.q_norm.weight])[symbol:model.layers.23.self_attn.q_norm.weight] + tensor.CPU.register () -> (%1089:tensor<[128], Float32, CPU>[@model.layers.23.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=858), symbol:model.layers.23.self_attn.k_norm.weight])[symbol:model.layers.23.self_attn.k_norm.weight] + tensor.CPU.register () -> (%4076:tensor<[2048, 2048], Float32, CPU>[@model.layers.23.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=871), symbol:model.layers.23.self_attn.o_proj.weight])[symbol:model.layers.23.self_attn.o_proj.weight] + tensor.CPU.register () -> (%4535:tensor<[2048], Float32, CPU>[@model.layers.23.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=874), symbol:model.layers.23.post_attention_layernorm.weight])[symbol:model.layers.23.post_attention_layernorm.weight] + tensor.CPU.register () -> (%7750:tensor<[6144, 2048], Float32, CPU>[@model.layers.23.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=875), 
symbol:model.layers.23.mlp.gate_proj.weight])[symbol:model.layers.23.mlp.gate_proj.weight] + tensor.CPU.register () -> (%4744:tensor<[6144, 2048], Float32, CPU>[@model.layers.23.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=878), symbol:model.layers.23.mlp.up_proj.weight])[symbol:model.layers.23.mlp.up_proj.weight] + tensor.CPU.register () -> (%2933:tensor<[2048, 6144], Float32, CPU>[@model.layers.23.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=880), symbol:model.layers.23.mlp.down_proj.weight])[symbol:model.layers.23.mlp.down_proj.weight] + tensor.CPU.register () -> (%1154:tensor<[2048], Float32, CPU>[@model.layers.24.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=883), symbol:model.layers.24.input_layernorm.weight])[symbol:model.layers.24.input_layernorm.weight] + tensor.CPU.register () -> (%2384:tensor<[2048, 2048], Float32, CPU>[@model.layers.24.self_attn.q_proj.weight][symbol:model.layers.24.self_attn.q_proj.weight])[symbol:model.layers.24.self_attn.q_proj.weight] + tensor.CPU.register () -> (%2620:tensor<[1024, 2048], Float32, CPU>[@model.layers.24.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=884), symbol:model.layers.24.self_attn.k_proj.weight])[symbol:model.layers.24.self_attn.k_proj.weight] + tensor.CPU.register () -> (%3265:tensor<[1024, 2048], Float32, CPU>[@model.layers.24.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, 
scale_1_type: Float32), uuid=886), symbol:model.layers.24.self_attn.v_proj.weight])[symbol:model.layers.24.self_attn.v_proj.weight] + tensor.CPU.register () -> (%2985:tensor<[128], Float32, CPU>[@model.layers.24.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=890), symbol:model.layers.24.self_attn.q_norm.weight])[symbol:model.layers.24.self_attn.q_norm.weight] + tensor.CPU.register () -> (%3894:tensor<[128], Float32, CPU>[@model.layers.24.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=892), symbol:model.layers.24.self_attn.k_norm.weight])[symbol:model.layers.24.self_attn.k_norm.weight] + tensor.CPU.register () -> (%7488:tensor<[2048, 2048], Float32, CPU>[@model.layers.24.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=905), symbol:model.layers.24.self_attn.o_proj.weight])[symbol:model.layers.24.self_attn.o_proj.weight] + tensor.CPU.register () -> (%6713:tensor<[2048], Float32, CPU>[@model.layers.24.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=908), symbol:model.layers.24.post_attention_layernorm.weight])[symbol:model.layers.24.post_attention_layernorm.weight] + tensor.CPU.register () -> (%1336:tensor<[6144, 2048], Float32, CPU>[@model.layers.24.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=909), symbol:model.layers.24.mlp.gate_proj.weight])[symbol:model.layers.24.mlp.gate_proj.weight] + tensor.CPU.register () -> (%7035:tensor<[6144, 2048], Float32, CPU>[@model.layers.24.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, 
block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=912), symbol:model.layers.24.mlp.up_proj.weight])[symbol:model.layers.24.mlp.up_proj.weight] + tensor.CPU.register () -> (%7069:tensor<[2048, 6144], Float32, CPU>[@model.layers.24.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=914), symbol:model.layers.24.mlp.down_proj.weight])[symbol:model.layers.24.mlp.down_proj.weight] + tensor.CPU.register () -> (%6496:tensor<[2048], Float32, CPU>[@model.layers.25.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=917), symbol:model.layers.25.input_layernorm.weight])[symbol:model.layers.25.input_layernorm.weight] + tensor.CPU.register () -> (%1852:tensor<[2048, 2048], Float32, CPU>[@model.layers.25.self_attn.q_proj.weight][symbol:model.layers.25.self_attn.q_proj.weight])[symbol:model.layers.25.self_attn.q_proj.weight] + tensor.CPU.register () -> (%3615:tensor<[1024, 2048], Float32, CPU>[@model.layers.25.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=918), symbol:model.layers.25.self_attn.k_proj.weight])[symbol:model.layers.25.self_attn.k_proj.weight] + tensor.CPU.register () -> (%2014:tensor<[1024, 2048], Float32, CPU>[@model.layers.25.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=920), symbol:model.layers.25.self_attn.v_proj.weight])[symbol:model.layers.25.self_attn.v_proj.weight] + tensor.CPU.register () -> (%2021:tensor<[128], Float32, 
CPU>[@model.layers.25.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=924), symbol:model.layers.25.self_attn.q_norm.weight])[symbol:model.layers.25.self_attn.q_norm.weight] + tensor.CPU.register () -> (%1413:tensor<[128], Float32, CPU>[@model.layers.25.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=926), symbol:model.layers.25.self_attn.k_norm.weight])[symbol:model.layers.25.self_attn.k_norm.weight] + tensor.CPU.register () -> (%7074:tensor<[2048, 2048], Float32, CPU>[@model.layers.25.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=939), symbol:model.layers.25.self_attn.o_proj.weight])[symbol:model.layers.25.self_attn.o_proj.weight] + tensor.CPU.register () -> (%6424:tensor<[2048], Float32, CPU>[@model.layers.25.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=942), symbol:model.layers.25.post_attention_layernorm.weight])[symbol:model.layers.25.post_attention_layernorm.weight] + tensor.CPU.register () -> (%1860:tensor<[6144, 2048], Float32, CPU>[@model.layers.25.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=943), symbol:model.layers.25.mlp.gate_proj.weight])[symbol:model.layers.25.mlp.gate_proj.weight] + tensor.CPU.register () -> (%5840:tensor<[6144, 2048], Float32, CPU>[@model.layers.25.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=946), 
symbol:model.layers.25.mlp.up_proj.weight])[symbol:model.layers.25.mlp.up_proj.weight] + tensor.CPU.register () -> (%6869:tensor<[2048, 6144], Float32, CPU>[@model.layers.25.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=948), symbol:model.layers.25.mlp.down_proj.weight])[symbol:model.layers.25.mlp.down_proj.weight] + tensor.CPU.register () -> (%611:tensor<[2048], Float32, CPU>[@model.layers.26.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=951), symbol:model.layers.26.input_layernorm.weight])[symbol:model.layers.26.input_layernorm.weight] + tensor.CPU.register () -> (%1040:tensor<[2048, 2048], Float32, CPU>[@model.layers.26.self_attn.q_proj.weight][symbol:model.layers.26.self_attn.q_proj.weight])[symbol:model.layers.26.self_attn.q_proj.weight] + tensor.CPU.register () -> (%2312:tensor<[1024, 2048], Float32, CPU>[@model.layers.26.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=952), symbol:model.layers.26.self_attn.k_proj.weight])[symbol:model.layers.26.self_attn.k_proj.weight] + tensor.CPU.register () -> (%174:tensor<[1024, 2048], Float32, CPU>[@model.layers.26.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=954), symbol:model.layers.26.self_attn.v_proj.weight])[symbol:model.layers.26.self_attn.v_proj.weight] + tensor.CPU.register () -> (%2799:tensor<[128], Float32, CPU>[@model.layers.26.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=958), 
symbol:model.layers.26.self_attn.q_norm.weight])[symbol:model.layers.26.self_attn.q_norm.weight] + tensor.CPU.register () -> (%6479:tensor<[128], Float32, CPU>[@model.layers.26.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=960), symbol:model.layers.26.self_attn.k_norm.weight])[symbol:model.layers.26.self_attn.k_norm.weight] + tensor.CPU.register () -> (%504:tensor<[2048, 2048], Float32, CPU>[@model.layers.26.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=973), symbol:model.layers.26.self_attn.o_proj.weight])[symbol:model.layers.26.self_attn.o_proj.weight] + tensor.CPU.register () -> (%5096:tensor<[2048], Float32, CPU>[@model.layers.26.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=976), symbol:model.layers.26.post_attention_layernorm.weight])[symbol:model.layers.26.post_attention_layernorm.weight] + tensor.CPU.register () -> (%4867:tensor<[6144, 2048], Float32, CPU>[@model.layers.26.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=977), symbol:model.layers.26.mlp.gate_proj.weight])[symbol:model.layers.26.mlp.gate_proj.weight] + tensor.CPU.register () -> (%2619:tensor<[6144, 2048], Float32, CPU>[@model.layers.26.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=980), symbol:model.layers.26.mlp.up_proj.weight])[symbol:model.layers.26.mlp.up_proj.weight] + tensor.CPU.register () -> (%1355:tensor<[2048, 6144], Float32, 
CPU>[@model.layers.26.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=982), symbol:model.layers.26.mlp.down_proj.weight])[symbol:model.layers.26.mlp.down_proj.weight] + tensor.CPU.register () -> (%6381:tensor<[2048], Float32, CPU>[@model.layers.27.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=985), symbol:model.layers.27.input_layernorm.weight])[symbol:model.layers.27.input_layernorm.weight] + tensor.CPU.register () -> (%5946:tensor<[2048, 2048], Float32, CPU>[@model.layers.27.self_attn.q_proj.weight][symbol:model.layers.27.self_attn.q_proj.weight])[symbol:model.layers.27.self_attn.q_proj.weight] + tensor.CPU.register () -> (%1802:tensor<[1024, 2048], Float32, CPU>[@model.layers.27.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=986), symbol:model.layers.27.self_attn.k_proj.weight])[symbol:model.layers.27.self_attn.k_proj.weight] + tensor.CPU.register () -> (%6652:tensor<[1024, 2048], Float32, CPU>[@model.layers.27.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=988), symbol:model.layers.27.self_attn.v_proj.weight])[symbol:model.layers.27.self_attn.v_proj.weight] + tensor.CPU.register () -> (%6206:tensor<[128], Float32, CPU>[@model.layers.27.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=992), symbol:model.layers.27.self_attn.q_norm.weight])[symbol:model.layers.27.self_attn.q_norm.weight] + tensor.CPU.register () -> (%1743:tensor<[128], Float32, 
CPU>[@model.layers.27.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=994), symbol:model.layers.27.self_attn.k_norm.weight])[symbol:model.layers.27.self_attn.k_norm.weight] + tensor.CPU.register () -> (%5189:tensor<[2048, 2048], Float32, CPU>[@model.layers.27.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1007), symbol:model.layers.27.self_attn.o_proj.weight])[symbol:model.layers.27.self_attn.o_proj.weight] + tensor.CPU.register () -> (%3001:tensor<[2048], Float32, CPU>[@model.layers.27.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1010), symbol:model.layers.27.post_attention_layernorm.weight])[symbol:model.layers.27.post_attention_layernorm.weight] + tensor.CPU.register () -> (%5561:tensor<[6144, 2048], Float32, CPU>[@model.layers.27.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1011), symbol:model.layers.27.mlp.gate_proj.weight])[symbol:model.layers.27.mlp.gate_proj.weight] + tensor.CPU.register () -> (%2731:tensor<[6144, 2048], Float32, CPU>[@model.layers.27.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1014), symbol:model.layers.27.mlp.up_proj.weight])[symbol:model.layers.27.mlp.up_proj.weight] + tensor.CPU.register () -> (%3783:tensor<[2048, 6144], Float32, CPU>[@model.layers.27.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, 
scale_1_type: Float32), uuid=1016), symbol:model.layers.27.mlp.down_proj.weight])[symbol:model.layers.27.mlp.down_proj.weight] + tensor.CPU.register () -> (%5765:tensor<[2048], Float32, CPU>[@model.norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1019), symbol:model.norm.weight])[symbol:model.norm.weight] + tensor.CPU.register () -> (%6130:tensor<[151936, 2048], Float32, CPU>[@lm_head.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1020), symbol:lm_head.weight])[symbol:lm_head.weight] } } graph.SubGraphOp @deinit [symbol:deinit] { @@ -319,1697 +321,1697 @@ } } - graph.CallGraphOp @model (%318:tensor<[1, 32], Int64, CPU>[quant_recipe:QuantSpec(Raw(type: Int64), uuid=0)], %376:tensor<[1, 32], Int64, CPU>[quant_recipe:QuantSpec(Raw(type: Int64), uuid=1)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %320:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %322:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %324:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %326:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %328:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %330:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %332:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %334:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %336:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %338:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %340:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %342:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %344:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %346:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %348:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %350:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %352:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=19)], %354:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %356:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %358:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %360:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %362:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %364:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %366:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %368:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %370:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %372:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %374:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %321:tensor<[1, 8, 992, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)], %323:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)], %325:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)], %327:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)], %329:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)], %331:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)], %333:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)], %335:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)], %337:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)], %339:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)], %341:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)], %343:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=42)], %345:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)], %347:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)], %349:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)], %351:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)], %353:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)], %355:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)], %357:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)], %359:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)], %361:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)], %363:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)], %365:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)], %367:tensor<[1, 8, 992, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)], %369:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)], %371:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)], %373:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)], %375:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%1530:tensor<[1, 32, 151936], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1020)], %394:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=77)], %435:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=111)], %476:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=145)], %517:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=179)], %558:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=213)], %599:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=247)], %640:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=281)], %681:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=315)], %722:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=349)], %763:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=383)], %804:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=417)], %845:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=451)], %886:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=485)], %927:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=519)], %968:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=553)], %1009:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=587)], %1050:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=621)], %1091:tensor<[1, 8, 128, 32], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=655)], %1132:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=689)], %1173:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=723)], %1214:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=757)], %1255:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=791)], %1296:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=825)], %1337:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=859)], %1378:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=893)], %1419:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=927)], %1460:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=961)], %1501:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=995)], %396:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 
127, quant_to_type: Int8, scale_type: Float32), uuid=79)], %437:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=113)], %478:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=147)], %519:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=181)], %560:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=215)], %601:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=249)], %642:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=283)], %683:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=317)], %724:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=351)], %765:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=385)], %806:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=419)], %847:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=453)], %888:tensor<[1, 8, 32, 128], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=487)], %929:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=521)], %970:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=555)], %1011:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=589)], %1052:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=623)], %1093:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=657)], %1134:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=691)], %1175:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=725)], %1216:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=759)], %1257:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=793)], %1298:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=827)], %1339:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 
127, quant_to_type: Int8, scale_type: Float32), uuid=861)], %1380:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=895)], %1421:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=929)], %1462:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=963)], %1503:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=997)]) + graph.CallGraphOp @model (%8013:tensor<[1, 32], Int32, CPU>[quant_recipe:QuantSpec(Raw(type: Int32), uuid=0)], %8071:tensor<[1, 32], Int64, CPU>[quant_recipe:QuantSpec(Raw(type: Int64), uuid=1)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8015:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %8017:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %8019:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %8021:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %8023:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %8025:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %8027:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %8029:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %8031:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %8033:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %8035:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %8037:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %8039:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %8041:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %8043:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %8045:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %8047:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=19)], %8049:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %8051:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %8053:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %8055:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %8057:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %8059:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %8061:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %8063:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %8065:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %8067:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %8069:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %8016:tensor<[1, 8, 992, 128], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)], %8018:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)], %8020:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)], %8022:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)], %8024:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)], %8026:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)], %8028:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)], %8030:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)], %8032:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)], %8034:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)], %8036:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)], %8038:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 
127, quant_to_type: Int8, scale_type: Float32), uuid=42)], %8040:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)], %8042:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)], %8044:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)], %8046:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)], %8048:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)], %8050:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)], %8052:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)], %8054:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)], %8056:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)], %8058:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)], %8060:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)], %8062:tensor<[1, 8, 992, 
128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)], %8064:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)], %8066:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)], %8068:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)], %8070:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%9225:tensor<[1, 32, 151936], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1021)], %8089:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78)], %8130:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112)], %8171:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146)], %8212:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180)], %8253:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214)], %8294:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248)], %8335:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282)], %8376:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)], %8417:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350)], %8458:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384)], %8499:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418)], %8540:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452)], %8581:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486)], %8622:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520)], %8663:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %8704:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588)], %8745:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622)], 
%8786:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656)], %8827:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690)], %8868:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724)], %8909:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758)], %8950:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792)], %8991:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)], %9032:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860)], %9073:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894)], %9114:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928)], %9155:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962)], %9196:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996)], %8091:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=80)], %8132:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=114)], %8173:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=148)], %8214:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=182)], %8255:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=216)], %8296:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=250)], %8337:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)], %8378:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=318)], %8419:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=352)], %8460:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=386)], %8501:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=420)], %8542:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=454)], %8583:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=488)], %8624:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=522)], %8665:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)], %8706:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=590)], %8747:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=624)], %8788:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=658)], %8829:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=692)], %8870:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=726)], %8911:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=760)], %8952:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)], %8993:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=828)], %9034:tensor<[1, 8, 32, 128], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=862)], %9075:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=896)], %9116:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=930)], %9157:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=964)], %9198:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=998)]) graph.SubGraphOp @model [using_qnn:true, symbol:model] { - (%318:tensor<[1, 32], Int64, CPU>[quant_recipe:QuantSpec(Raw(type: Int64), uuid=0)], %376:tensor<[1, 32], Int64, CPU>[quant_recipe:QuantSpec(Raw(type: Int64), uuid=1)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %320:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %322:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %324:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %326:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %328:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=7)], %330:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %332:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %334:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %336:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %338:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %340:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %342:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %344:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %346:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %348:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %350:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %352:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %354:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %356:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %358:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %360:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %362:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %364:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %366:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %368:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %370:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %372:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %374:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=30)], %321:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)], %323:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)], %325:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)], %327:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)], %329:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)], %331:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)], %333:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)], %335:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)], %337:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)], %339:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)], %341:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)], %343:tensor<[1, 8, 992, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)], %345:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)], %347:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)], %349:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)], %351:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)], %353:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)], %355:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)], %357:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)], %359:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)], %361:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)], %363:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)], %365:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=53)], %367:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)], %369:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)], %371:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)], %373:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)], %375:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%1530:tensor<[1, 32, 151936], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1020)], %394:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=77)], %435:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=111)], %476:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=145)], %517:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=179)], %558:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=213)], %599:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=247)], %640:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=281)], %681:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=315)], %722:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=349)], %763:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=383)], %804:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=417)], %845:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=451)], %886:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=485)], %927:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=519)], %968:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=553)], %1009:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=587)], %1050:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=621)], %1091:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=655)], %1132:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=689)], %1173:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=723)], %1214:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=757)], %1255:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=791)], %1296:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=825)], %1337:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=859)], %1378:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=893)], %1419:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=927)], %1460:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=961)], %1501:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=995)], %396:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=79)], %437:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=113)], %478:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=147)], %519:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=181)], %560:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=215)], %601:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=249)], %642:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=283)], %683:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=317)], %724:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=351)], %765:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=385)], %806:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=419)], %847:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=453)], %888:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=487)], %929:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=521)], %970:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=555)], %1011:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=589)], %1052:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=623)], %1093:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=657)], %1134:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=691)], %1175:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=725)], %1216:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=759)], %1257:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=793)], %1298:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=827)], %1339:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=861)], %1380:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=895)], %1421:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=929)], %1462:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=963)], %1503:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=997)]) { - linalg.CPU.EmbeddingOp (%318:tensor<[1, 32], Int64, CPU>[quant_recipe:QuantSpec(Raw(type: Int64), uuid=0)]) -> (%377:tensor<[1, 32, 2048], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=59)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float32), uuid=59), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60), )] (%377:tensor<[1, 32, 2048], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=59)]) -> (%378:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int64), uuid=1), outputs_0:QuantSpec(Raw(type: Int64), uuid=1), )] (%376:tensor<[1, 32], Int64, CPU>[quant_recipe:QuantSpec(Raw(type: Int64), uuid=1)]) -> (%376:tensor<[32], Int64, CPU>[quant_recipe:QuantSpec(Raw(type: Int64), uuid=1)]) - linalg.CPU.IndexOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=61), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), )] (%316:tensor<[1, 1024, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=61)]) -> (%379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)]) - linalg.CPU.IndexOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), )] (%317:tensor<[1, 1024, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)]) -> (%380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) - graph.CallGraphOp @model.layers.0 (%378:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %320:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=3)], %321:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) -> (%421:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=98)], %394:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=77)], %396:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=79)]) - graph.CallGraphOp @model.layers.1 (%421:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=98)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %322:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %323:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) -> (%462:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=132)], %435:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=111)], %437:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=113)]) - graph.CallGraphOp @model.layers.2 (%462:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=132)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %324:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %325:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) -> (%503:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=166)], %476:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=145)], %478:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=147)]) - graph.CallGraphOp @model.layers.3 (%503:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=166)], 
%379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %326:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %327:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) -> (%544:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=200)], %517:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=179)], %519:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=181)]) - graph.CallGraphOp @model.layers.4 (%544:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=200)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %328:tensor<[1, 8, 128, 
992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %329:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) -> (%585:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234)], %558:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=213)], %560:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=215)]) - graph.CallGraphOp @model.layers.5 (%585:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %330:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %331:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) -> (%626:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=268)], %599:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=247)], %601:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=249)]) - graph.CallGraphOp @model.layers.6 (%626:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=268)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %332:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %333:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) -> (%667:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=302)], %640:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=281)], %642:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=283)]) - graph.CallGraphOp @model.layers.7 (%667:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=302)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %334:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %335:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) -> (%708:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=336)], %681:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=315)], %683:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=317)]) - graph.CallGraphOp @model.layers.8 (%708:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=336)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %336:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %337:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) -> (%749:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=370)], %722:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=349)], %724:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=351)]) - graph.CallGraphOp @model.layers.9 (%749:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=370)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %338:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %339:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) -> 
(%790:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=404)], %763:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=383)], %765:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=385)]) - graph.CallGraphOp @model.layers.10 (%790:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=404)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %340:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %341:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) -> (%831:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=438)], %804:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=417)], %806:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 
127, quant_to_type: Int8, scale_type: Float32), uuid=419)]) - graph.CallGraphOp @model.layers.11 (%831:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=438)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %342:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %343:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) -> (%872:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=472)], %845:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=451)], %847:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=453)]) - graph.CallGraphOp @model.layers.12 (%872:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=472)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 
32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %344:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %345:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) -> (%913:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506)], %886:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=485)], %888:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=487)]) - graph.CallGraphOp @model.layers.13 (%913:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %346:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %347:tensor<[1, 8, 992, 128], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) -> (%954:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=540)], %927:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=519)], %929:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=521)]) - graph.CallGraphOp @model.layers.14 (%954:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=540)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %348:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %349:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) -> (%995:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574)], %968:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=553)], %970:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=555)]) - graph.CallGraphOp @model.layers.15 (%995:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %350:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %351:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) -> (%1036:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=608)], %1009:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=587)], %1011:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=589)]) - graph.CallGraphOp @model.layers.16 (%1036:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=608)], %379:tensor<[1, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %352:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %353:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) -> (%1077:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=642)], %1050:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=621)], %1052:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=623)]) - graph.CallGraphOp @model.layers.17 (%1077:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=642)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %354:tensor<[1, 8, 128, 992], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %355:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) -> (%1118:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=676)], %1091:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=655)], %1093:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=657)]) - graph.CallGraphOp @model.layers.18 (%1118:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=676)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %356:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %357:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) -> (%1159:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=710)], %1132:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=689)], %1134:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=691)]) - graph.CallGraphOp @model.layers.19 (%1159:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=710)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %358:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %359:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) -> (%1200:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744)], %1173:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=723)], %1175:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=725)]) - graph.CallGraphOp @model.layers.20 (%1200:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %360:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %361:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) -> (%1241:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=778)], %1214:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=757)], %1216:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=759)]) - graph.CallGraphOp @model.layers.21 (%1241:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=778)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %362:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %363:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) -> (%1282:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=812)], %1255:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=791)], %1257:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=793)]) - graph.CallGraphOp @model.layers.22 (%1282:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=812)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %364:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %365:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=53)]) -> (%1323:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=846)], %1296:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=825)], %1298:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=827)]) - graph.CallGraphOp @model.layers.23 (%1323:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=846)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %366:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %367:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) -> (%1364:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=880)], %1337:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=859)], %1339:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=861)]) - graph.CallGraphOp @model.layers.24 (%1364:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=880)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %368:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %369:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) -> (%1405:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=914)], %1378:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=893)], %1380:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=895)]) - graph.CallGraphOp @model.layers.25 (%1405:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=914)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %370:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %371:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) -> (%1446:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=948)], %1419:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=927)], %1421:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=929)]) - graph.CallGraphOp @model.layers.26 (%1446:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=948)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %372:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=29)], %373:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) -> (%1487:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=982)], %1460:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=961)], %1462:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=963)]) - graph.CallGraphOp @model.layers.27 (%1487:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=982)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %374:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %375:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%1528:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1016)], %1501:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=995)], %1503:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=997)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1016), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017), )] (%1528:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1016)]) -> (%1529:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1020), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1019)), using_qnn:true] (%1529:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017)]) -> (%1530:tensor<[1, 32, 151936], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1020)]) - cf.ReturnOp (%1530:tensor<[1, 32, 151936], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=1020)], %394:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=77)], %435:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=111)], %476:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=145)], %517:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=179)], %558:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=213)], %599:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=247)], %640:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=281)], %681:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=315)], %722:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=349)], %763:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=383)], %804:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=417)], %845:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=451)], %886:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=485)], %927:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=519)], %968:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=553)], %1009:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=587)], %1050:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=621)], %1091:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=655)], %1132:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=689)], %1173:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=723)], %1214:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=757)], %1255:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=791)], %1296:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: 
Int8, scale_type: Float32), uuid=825)], %1337:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=859)], %1378:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=893)], %1419:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=927)], %1460:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=961)], %1501:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=995)], %396:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=79)], %437:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=113)], %478:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=147)], %519:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=181)], %560:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=215)], %601:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=249)], %642:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=283)], %683:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=317)], %724:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=351)], %765:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=385)], %806:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=419)], %847:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=453)], %888:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=487)], %929:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=521)], %970:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=555)], %1011:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=589)], %1052:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=623)], %1093:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: 
Int8, scale_type: Float32), uuid=657)], %1134:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=691)], %1175:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=725)], %1216:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=759)], %1257:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=793)], %1298:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=827)], %1339:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=861)], %1380:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=895)], %1421:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=929)], %1462:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=963)], %1503:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=997)]) -> () + (%8013:tensor<[1, 32], Int32, CPU>[quant_recipe:QuantSpec(Raw(type: Int32), uuid=0)], %8071:tensor<[1, 32], Int64, CPU>[quant_recipe:QuantSpec(Raw(type: Int64), uuid=1)], %8014:tensor<[1, 1, 32, 1024], UInt16, 
CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8015:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %8017:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %8019:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %8021:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %8023:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %8025:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %8027:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %8029:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %8031:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %8033:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %8035:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %8037:tensor<[1, 8, 128, 992], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %8039:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %8041:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %8043:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %8045:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %8047:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %8049:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %8051:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %8053:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %8055:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %8057:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %8059:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 
127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %8061:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %8063:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %8065:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %8067:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %8069:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %8016:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)], %8018:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)], %8020:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)], %8022:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)], %8024:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)], %8026:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)], %8028:tensor<[1, 8, 992, 
128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)], %8030:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)], %8032:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)], %8034:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)], %8036:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)], %8038:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)], %8040:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)], %8042:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)], %8044:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)], %8046:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)], %8048:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)], %8050:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)], %8052:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)], %8054:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)], %8056:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)], %8058:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)], %8060:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)], %8062:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)], %8064:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)], %8066:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)], %8068:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)], %8070:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%9225:tensor<[1, 32, 151936], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1021)], 
%8089:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78)], %8130:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112)], %8171:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146)], %8212:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180)], %8253:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214)], %8294:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248)], %8335:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282)], %8376:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)], %8417:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350)], %8458:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384)], %8499:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418)], %8540:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452)], %8581:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486)], %8622:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520)], %8663:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %8704:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588)], %8745:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622)], %8786:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656)], %8827:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690)], %8868:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724)], %8909:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758)], %8950:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792)], %8991:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=826)], %9032:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860)], %9073:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894)], %9114:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928)], %9155:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962)], %9196:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996)], %8091:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=80)], %8132:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=114)], %8173:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=148)], %8214:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=182)], %8255:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=216)], %8296:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=250)], %8337:tensor<[1, 8, 32, 128], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)], %8378:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=318)], %8419:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=352)], %8460:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=386)], %8501:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=420)], %8542:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=454)], %8583:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=488)], %8624:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=522)], %8665:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)], %8706:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=590)], %8747:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=624)], %8788:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 
127, quant_to_type: Int8, scale_type: Float32), uuid=658)], %8829:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=692)], %8870:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=726)], %8911:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=760)], %8952:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)], %8993:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=828)], %9034:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=862)], %9075:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=896)], %9116:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=930)], %9157:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=964)], %9198:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=998)]) { + linalg.CPU.EmbeddingOp (%8013:tensor<[1, 32], Int32, CPU>[quant_recipe:QuantSpec(Raw(type: Int32), uuid=0)]) -> (%8072:tensor<[1, 32, 2048], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: 
Float32), uuid=59)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float32), uuid=59), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60), weight_weight:QuantSpec(Raw(type: Float32), uuid=61))] (%8072:tensor<[1, 32, 2048], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=59)]) -> (%8073:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int64), uuid=1), outputs_0:QuantSpec(Raw(type: Int64), uuid=1), )] (%8071:tensor<[1, 32], Int64, CPU>[quant_recipe:QuantSpec(Raw(type: Int64), uuid=1)]) -> (%8071:tensor<[32], Int64, CPU>[quant_recipe:QuantSpec(Raw(type: Int64), uuid=1)]) + linalg.CPU.IndexOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), )] (%8011:tensor<[1, 1024, 128], Int16PerTensor, CPU>[@rope_sin][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), symbol:rope_sin]) -> (%8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)]) + linalg.CPU.IndexOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), )] (%8012:tensor<[1, 1024, 128], Int16PerTensor, CPU>[@rope_cos][quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), symbol:rope_cos]) -> (%8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) + graph.CallGraphOp @model.layers.0 (%8073:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8015:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %8016:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) -> (%8116:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)], %8089:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78)], %8091:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=80)]) + graph.CallGraphOp @model.layers.1 (%8116:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=99)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8017:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %8018:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) -> (%8157:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)], %8130:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112)], %8132:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=114)]) + graph.CallGraphOp @model.layers.2 (%8157:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, 
CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8019:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %8020:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) -> (%8198:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)], %8171:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146)], %8173:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=148)]) + graph.CallGraphOp @model.layers.3 (%8198:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8021:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %8022:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) -> (%8239:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)], %8212:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180)], %8214:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=182)]) + graph.CallGraphOp @model.layers.4 (%8239:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8023:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %8024:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) -> (%8280:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)], %8253:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214)], %8255:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=216)]) + graph.CallGraphOp @model.layers.5 (%8280:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8025:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %8026:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) -> (%8321:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)], %8294:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248)], %8296:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=250)]) + graph.CallGraphOp @model.layers.6 (%8321:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8027:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %8028:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) -> (%8362:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)], %8335:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282)], %8337:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)]) + graph.CallGraphOp @model.layers.7 (%8362:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8029:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %8030:tensor<[1, 8, 992, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) -> (%8403:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)], %8376:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)], %8378:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=318)]) + graph.CallGraphOp @model.layers.8 (%8403:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8031:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %8032:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) -> (%8444:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)], %8417:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=350)], %8419:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=352)]) + graph.CallGraphOp @model.layers.9 (%8444:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8033:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %8034:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) -> (%8485:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)], %8458:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384)], %8460:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=386)]) + graph.CallGraphOp @model.layers.10 (%8485:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)], %8074:tensor<[1, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8035:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %8036:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) -> (%8526:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)], %8499:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418)], %8501:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=420)]) + graph.CallGraphOp @model.layers.11 (%8526:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8037:tensor<[1, 8, 128, 992], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %8038:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) -> (%8567:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)], %8540:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452)], %8542:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=454)]) + graph.CallGraphOp @model.layers.12 (%8567:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8039:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %8040:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) -> (%8608:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=507)], %8581:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486)], %8583:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=488)]) + graph.CallGraphOp @model.layers.13 (%8608:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8041:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %8042:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) -> (%8649:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)], %8622:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520)], %8624:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=522)]) + graph.CallGraphOp @model.layers.14 (%8649:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8043:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %8044:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) -> (%8690:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)], %8663:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %8665:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)]) + graph.CallGraphOp @model.layers.15 (%8690:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8045:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %8046:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) -> (%8731:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)], %8704:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588)], %8706:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=590)]) + graph.CallGraphOp @model.layers.16 (%8731:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8047:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %8048:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=47)]) -> (%8772:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)], %8745:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622)], %8747:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=624)]) + graph.CallGraphOp @model.layers.17 (%8772:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8049:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %8050:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) -> (%8813:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)], %8786:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656)], %8788:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=658)]) + graph.CallGraphOp @model.layers.18 (%8813:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8051:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %8052:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) -> (%8854:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)], %8827:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690)], %8829:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=692)]) + graph.CallGraphOp @model.layers.19 (%8854:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8053:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %8054:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) -> (%8895:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)], %8868:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724)], %8870:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=726)]) + graph.CallGraphOp @model.layers.20 (%8895:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8055:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %8056:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) -> (%8936:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)], %8909:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758)], %8911:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=760)]) + graph.CallGraphOp @model.layers.21 (%8936:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8057:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %8058:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) -> (%8977:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)], %8950:tensor<[1, 8, 128, 32], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792)], %8952:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)]) + graph.CallGraphOp @model.layers.22 (%8977:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8059:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %8060:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) -> (%9018:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)], %8991:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)], %8993:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=828)]) + graph.CallGraphOp @model.layers.23 (%9018:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8061:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %8062:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) -> (%9059:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)], %9032:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860)], %9034:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=862)]) + graph.CallGraphOp @model.layers.24 (%9059:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 
1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8063:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %8064:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) -> (%9100:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)], %9073:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894)], %9075:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=896)]) + graph.CallGraphOp @model.layers.25 (%9100:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8065:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %8066:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) -> (%9141:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)], %9114:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928)], %9116:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=930)]) + graph.CallGraphOp @model.layers.26 (%9141:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8067:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %8068:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) -> (%9182:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)], %9155:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962)], %9157:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: 
Int8, scale_type: Float32), uuid=964)]) + graph.CallGraphOp @model.layers.27 (%9182:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8069:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %8070:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%9223:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017)], %9196:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996)], %9198:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=998)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1018), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=1019))] (%9223:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017)]) -> (%9224:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1018)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1018), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1021), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1020)), using_qnn:true] (%9224:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1018)]) -> (%9225:tensor<[1, 32, 151936], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1021)]) + cf.ReturnOp (%9225:tensor<[1, 32, 151936], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1021)], %8089:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78)], %8130:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112)], %8171:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146)], %8212:tensor<[1, 8, 128, 32], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180)], %8253:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214)], %8294:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248)], %8335:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282)], %8376:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)], %8417:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350)], %8458:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384)], %8499:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418)], %8540:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452)], %8581:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486)], %8622:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520)], %8663:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 
127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %8704:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588)], %8745:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622)], %8786:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656)], %8827:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690)], %8868:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724)], %8909:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758)], %8950:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792)], %8991:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)], %9032:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860)], %9073:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894)], %9114:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928)], %9155:tensor<[1, 8, 128, 
32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962)], %9196:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996)], %8091:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=80)], %8132:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=114)], %8173:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=148)], %8214:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=182)], %8255:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=216)], %8296:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=250)], %8337:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)], %8378:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=318)], %8419:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=352)], %8460:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=386)], %8501:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=420)], %8542:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=454)], %8583:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=488)], %8624:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=522)], %8665:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)], %8706:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=590)], %8747:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=624)], %8788:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=658)], %8829:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=692)], %8870:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=726)], %8911:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=760)], 
%8952:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)], %8993:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=828)], %9034:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=862)], %9075:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=896)], %9116:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=930)], %9157:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=964)], %9198:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=998)]) -> () } } graph.SubGraphOp @model.layers.0 [using_qnn:true, symbol:model.layers.0] { - (%378:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %320:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %321:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) -> (%421:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=98)], %394:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=77)], %396:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=79)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), )] (%378:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60)]) -> (%381:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) - graph.CallGraphOp @model.layers.0.self_attn (%381:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %320:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %321:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) -> (%413:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=89)], %394:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=77)], %396:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=79)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=89), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=89), )] (%413:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=89)], %378:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60)]) -> (%414:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=89)]) - 
linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=89), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90), )] (%414:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=89)]) -> (%415:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90)]) - graph.CallGraphOp @model.layers.0.mlp (%415:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90)]) -> (%420:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=98)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=98), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=89), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=98), )] (%420:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=98)], %414:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=89)]) -> (%421:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=98)]) - cf.ReturnOp (%421:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=98)], %394:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=77)], %396:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=79)]) -> () + (%8073:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8015:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %8016:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) -> (%8116:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)], %8089:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78)], %8091:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=80)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=66), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=67))] (%8073:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60)]) -> (%8076:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=66)]) + graph.CallGraphOp @model.layers.0.self_attn (%8076:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=66)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8015:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %8016:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) -> (%8108:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=90)], %8089:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78)], %8091:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=80)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90), )] (%8108:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90)], %8073:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60)]) -> (%8109:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=91), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=92))] (%8109:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90)]) -> (%8110:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=91)]) + graph.CallGraphOp @model.layers.0.mlp (%8110:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=91)]) -> (%8115:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99), )] (%8115:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)], %8109:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90)]) -> (%8116:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)]) + cf.ReturnOp (%8116:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)], %8089:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78)], %8091:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=80)]) -> () } } graph.SubGraphOp @model.layers.0.self_attn [using_qnn:true, symbol:model.layers.0.self_attn] { - (%381:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %320:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %321:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) -> (%413:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=89)], %394:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=77)], %396:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=79)]) { - linalg.CPU.LinearOp (%381:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%382:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=71)]) - linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=68), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=67))] (%381:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%383:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=68)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=70), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=69))] (%381:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%384:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=70)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=71), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=71), )] (%382:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=71)]) -> (%382:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: 
Int16PerTensor), uuid=71)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=71), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=71), )] (%382:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=71)]) -> (%385:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=71)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=68), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=68), )] (%383:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=68)]) -> (%383:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=68)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=68), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=68), )] (%383:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=68)]) -> (%386:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=68)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=70), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=70), )] (%384:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=70)]) -> (%384:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=70)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=70), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=70), )] (%384:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=70)]) -> (%387:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=70)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=71), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=72), )] (%385:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=71)]) -> (%388:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=72)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=68), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=74), )] (%386:tensor<[1, 8, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=68)]) -> (%389:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=74)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=72), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=72), )] (%388:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=72)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%390:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=72)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=74), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=74), )] (%389:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=74)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%391:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=74)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=74), outputs_0:QuantSpec(Raw(type: Float16), uuid=76), )] (%391:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=74)]) -> (%392:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=76)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=76), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=77), )] (%392:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=76)]) -> (%393:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=77)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=77), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=77), )] (%393:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=77)]) -> (%394:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=77)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=70), outputs_0:QuantSpec(Raw(type: Float16), uuid=78), )] (%387:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=70)]) -> (%395:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=78)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=78), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=79), )] (%395:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=78)]) -> (%396:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=79)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=77), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3), )] (%320:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %394:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=77)]) -> (%397:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=79), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31), )] (%321:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)], %396:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=79)]) -> (%398:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3), )] (%397:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)]) -> (%399:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=3)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31), )] (%398:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) -> (%400:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=72), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=80), )] (%390:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=72)], %399:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)]) -> (%401:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=80)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=80), inputs_1:QuantSpec(Raw(type: Float32), uuid=81), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=80), )] (%401:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=80)], %402:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=81), constant:[0.088388346]]) -> (%403:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=80)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=80), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=82), )] (%403:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=80)]) -> (%404:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=82)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=82), inputs_1:QuantSpec(Raw(type: Int16), uuid=83), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=82), )] (%404:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=82)], %405:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=83), constant:[-20]]) -> (%406:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=82)]) - linalg.CPU.EqualOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=84), outputs_0:QuantSpec(Raw(type: UInt8), uuid=85), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %407:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=84), constant:[0]]) -> (%408:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=85)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=85), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=80), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=82), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=82), )] (%408:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=85)], %403:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=80)], %406:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=82)]) -> (%409:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=82)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=82), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=86), )] (%409:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=82)]) -> (%410:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=86)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=86), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87), )] (%410:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=86)], %400:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) -> (%411:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87), )] (%411:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87)]) -> (%412:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=87), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87), )] (%412:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87)]) -> (%412:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=89), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=88))] (%412:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87)]) -> (%413:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=89)]) - cf.ReturnOp (%413:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=89)], %394:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=77)], %396:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=79)]) -> () + (%8076:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=66)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8015:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %8016:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) -> (%8108:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90)], %8089:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78)], %8091:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=80)]) { + linalg.CPU.LinearOp (%8076:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=66)]) -> (%8077:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=72)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=66), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=68))] (%8076:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=66)]) -> (%8078:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=66), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=70))] (%8076:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=66)]) -> (%8079:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=72), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=72), )] (%8077:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=72)]) -> (%8077:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=72)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=72), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=72), )] 
(%8077:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=72)]) -> (%8080:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=72)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69), )] (%8078:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69)]) -> (%8078:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69), )] (%8078:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69)]) -> (%8081:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71), )] (%8079:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=71)]) -> (%8079:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71), )] (%8079:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71)]) -> (%8082:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=72), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=73), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=74))] (%8080:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=72)]) -> (%8083:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=73)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=75), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=76))] (%8081:tensor<[1, 8, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69)]) -> (%8084:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=75)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=73), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=73), )] (%8083:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=73)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8085:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=73)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=75), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=75), )] (%8084:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=75)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8086:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=75)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=75), outputs_0:QuantSpec(Raw(type: Float16), uuid=77), )] (%8086:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=75)]) -> (%8087:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=77)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=77), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78), )] (%8087:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=77)]) -> (%8088:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=78), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78), )] (%8088:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78)]) -> (%8089:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71), outputs_0:QuantSpec(Raw(type: Float16), uuid=79), )] (%8082:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71)]) -> (%8090:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=79)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=79), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=80), )] (%8090:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=79)]) -> (%8091:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=80)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3), )] (%8015:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %8089:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78)]) -> (%8092:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=80), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31), )] (%8016:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)], %8091:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=80)]) -> (%8093:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3), )] (%8092:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)]) -> (%8094:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31), )] (%8093:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) -> (%8095:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=73), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=81), )] (%8085:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=73)], %8094:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)]) -> (%8096:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=81)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=81), inputs_1:QuantSpec(Raw(type: Float32), uuid=82), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=81), )] (%8096:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=81)], %8097:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=82), constant:[0.088388346]]) -> (%8098:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=81)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=81), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83), )] (%8098:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=81)]) -> (%8099:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83), inputs_1:QuantSpec(Raw(type: Int16), uuid=84), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83), )] (%8099:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83)], %8100:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=84), constant:[-20]]) -> (%8101:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83)]) + 
linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=85), outputs_0:QuantSpec(Raw(type: UInt8), uuid=86), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8102:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=85), constant:[0]]) -> (%8103:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=86)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=86), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=81), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83), )] (%8103:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=86)], %8098:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=81)], %8101:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83)]) -> (%8104:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87), )] (%8104:tensor<[1, 16, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83)]) -> (%8105:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=88), )] (%8105:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87)], %8095:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) -> (%8106:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=88)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=88), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=88), )] (%8106:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=88)]) -> (%8107:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=88)]) + linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=88), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=88), )] (%8107:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=88)]) -> (%8107:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=88)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=88), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=89))] (%8107:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=88)]) -> (%8108:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90)]) + cf.ReturnOp (%8108:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90)], %8089:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78)], %8091:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=80)]) -> () } } graph.SubGraphOp @model.layers.0.mlp [using_qnn:true, symbol:model.layers.0.mlp] { - (%415:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90)]) -> (%420:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=98)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=93), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=92))] (%415:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90)]) -> (%416:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=93)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=93), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94), )] (%416:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=93)]) -> (%417:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94)]) - linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=96), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=95))] (%415:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90)]) -> (%418:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=96)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=96), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94), )] (%417:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94)], %418:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=96)]) -> (%419:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=98), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=97))] (%419:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94)]) -> (%420:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=98)]) - cf.ReturnOp (%420:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=98)]) -> () + (%8110:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=91)]) -> (%8115:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=91), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=93))] (%8110:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=91)]) -> (%8111:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94)]) + linalg.CPU.SiLUOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=95), )] (%8111:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94)]) -> (%8112:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=95)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=91), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=97), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=96))] (%8110:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=91)]) -> (%8113:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=97)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=95), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=97), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=95), )] (%8112:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=95)], %8113:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=97)]) -> (%8114:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=95)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=95), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=98))] (%8114:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=95)]) -> (%8115:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)]) + cf.ReturnOp (%8115:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)]) -> () } } graph.SubGraphOp @model.layers.1 [using_qnn:true, symbol:model.layers.1] { - (%421:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=98)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %322:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %323:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) -> (%462:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=132)], %435:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=111)], %437:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=113)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=98), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99), )] (%421:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=98)]) -> (%422:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)]) - graph.CallGraphOp @model.layers.1.self_attn (%422:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)], %379:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %322:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %323:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) -> (%454:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=123)], %435:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=111)], %437:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=113)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=123), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=98), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=123), )] (%454:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=123)], %421:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=98)]) -> (%455:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=123)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=123), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124), )] (%455:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=123)]) -> (%456:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)]) - graph.CallGraphOp @model.layers.1.mlp (%456:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)]) -> (%461:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=132)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=132), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=123), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=132), )] (%461:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=132)], %455:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=123)]) -> (%462:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=132)]) - cf.ReturnOp (%462:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=132)], %435:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=111)], %437:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=113)]) -> () + (%8116:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8017:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %8018:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) -> (%8157:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)], %8130:tensor<[1, 8, 128, 32], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112)], %8132:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=114)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=100), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=101))] (%8116:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)]) -> (%8117:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=100)]) + graph.CallGraphOp @model.layers.1.self_attn (%8117:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=100)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8017:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %8018:tensor<[1, 8, 992, 128], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) -> (%8149:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)], %8130:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112)], %8132:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=114)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124), )] (%8149:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)], %8116:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)]) -> (%8150:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=125), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=126))] (%8150:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)]) -> (%8151:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=125)]) + graph.CallGraphOp @model.layers.1.mlp (%8151:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=125)]) -> (%8156:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133), )] (%8156:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)], %8150:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)]) -> (%8157:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)]) + cf.ReturnOp (%8157:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)], 
%8130:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112)], %8132:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=114)]) -> () } } graph.SubGraphOp @model.layers.1.self_attn [using_qnn:true, symbol:model.layers.1.self_attn] { - (%422:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %322:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %323:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) -> (%454:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=123)], %435:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=111)], %437:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=113)]) { - linalg.CPU.LinearOp (%422:tensor<[1, 32, 
2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)]) -> (%423:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=105)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=102), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=101))] (%422:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)]) -> (%424:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=102)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=104), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=103))] (%422:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)]) -> (%425:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=104)]) - linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=105), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=105), )] (%423:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=105)]) -> (%423:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=105)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=105), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=105), )] (%423:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=105)]) -> (%426:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=105)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=102), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=102), )] (%424:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=102)]) -> (%424:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=102)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=102), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=102), )] (%424:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=102)]) -> (%427:tensor<[1, 8, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=102)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=104), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=104), )] (%425:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=104)]) -> (%425:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=104)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=104), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=104), )] (%425:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=104)]) -> (%428:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=104)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=105), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=106), )] (%426:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=105)]) -> (%429:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=106)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=102), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=108), )] (%427:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=102)]) -> (%430:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=108)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=106), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=106), )] (%429:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=106)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%431:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=106)]) - linalg.CPU.RoPEOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=108), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=108), )] (%430:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=108)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%432:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=108)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=108), outputs_0:QuantSpec(Raw(type: Float16), uuid=110), )] (%432:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=108)]) -> (%433:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=110)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=110), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=111), )] (%433:tensor<[1, 
8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=110)]) -> (%434:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=111)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=111), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=111), )] (%434:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=111)]) -> (%435:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=111)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=104), outputs_0:QuantSpec(Raw(type: Float16), uuid=112), )] (%428:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=104)]) -> (%436:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=112)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=112), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=113), )] (%436:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=112)]) -> (%437:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=113)]) - linalg.CPU.ConcatOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=111), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4), )] (%322:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %435:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=111)]) -> (%438:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=113), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32), )] (%323:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)], %437:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=113)]) -> (%439:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=4), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4), )] (%438:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)]) -> (%440:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32), )] (%439:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) -> (%441:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=106), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=114), )] (%431:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=106)], %440:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)]) -> (%442:tensor<[1, 16, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=114)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=114), inputs_1:QuantSpec(Raw(type: Float32), uuid=115), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=114), )] (%442:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=114)], %443:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=115), constant:[0.088388346]]) -> (%444:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=114)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=114), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=116), )] (%444:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=114)]) -> (%445:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=116)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=116), inputs_1:QuantSpec(Raw(type: Int16), uuid=117), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=116), )] (%445:tensor<[1, 
16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=116)], %446:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=117), constant:[-20]]) -> (%447:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=116)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=118), outputs_0:QuantSpec(Raw(type: UInt8), uuid=119), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %448:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=118), constant:[0]]) -> (%449:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=119)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=119), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=114), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=116), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=116), )] (%449:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=119)], %444:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=114)], %447:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=116)]) -> (%450:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=116)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=116), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=120), )] (%450:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=116)]) -> (%451:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=120)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=120), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=121), )] (%451:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=120)], %441:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) -> (%452:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=121)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=121), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=121), )] 
(%452:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=121)]) -> (%453:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=121)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=121), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=121), )] (%453:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=121)]) -> (%453:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=121)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=121), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=123), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=122))] (%453:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=121)]) -> (%454:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=123)]) - cf.ReturnOp (%454:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=123)], %435:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=111)], %437:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=113)]) -> () + (%8117:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=100)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8017:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %8018:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) -> (%8149:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)], %8130:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112)], %8132:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=114)]) { + linalg.CPU.LinearOp (%8117:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=100)]) -> (%8118:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=106)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=100), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=103), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=102))] (%8117:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=100)]) -> (%8119:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=103)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=100), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=105), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=104))] (%8117:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=100)]) -> (%8120:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=105)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: 
Int16PerTensor), uuid=106), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=106), )] (%8118:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=106)]) -> (%8118:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=106)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=106), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=106), )] (%8118:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=106)]) -> (%8121:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=106)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=103), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=103), )] (%8119:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=103)]) -> (%8119:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=103)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=103), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=103), )] (%8119:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=103)]) -> (%8122:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=103)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=105), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=105), )] (%8120:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=105)]) -> (%8120:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=105)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=105), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=105), )] (%8120:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=105)]) -> (%8123:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=105)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=106), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=107), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=108))] (%8121:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=106)]) -> (%8124:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=107)]) + 
linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=103), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=110))] (%8122:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=103)]) -> (%8125:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=107), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=107), )] (%8124:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=107)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8126:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=107)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109), )] (%8125:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8127:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109), outputs_0:QuantSpec(Raw(type: Float16), uuid=111), )] (%8127:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109)]) -> (%8128:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=111)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=111), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112), )] (%8128:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=111)]) -> (%8129:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112), )] (%8129:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112)]) -> (%8130:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=105), outputs_0:QuantSpec(Raw(type: Float16), uuid=113), )] (%8123:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=105)]) -> (%8131:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=113)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=113), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=114), )] (%8131:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=113)]) -> (%8132:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=114)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4), )] (%8017:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %8130:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112)]) -> (%8133:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=114), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32), )] (%8018:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)], %8132:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=114)]) -> (%8134:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4), )] (%8133:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)]) -> (%8135:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32), )] (%8134:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) -> (%8136:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=107), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=115), )] (%8126:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=107)], %8135:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)]) -> (%8137:tensor<[1, 16, 32, 
1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=115)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=115), inputs_1:QuantSpec(Raw(type: Float32), uuid=116), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=115), )] (%8137:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=115)], %8138:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=116), constant:[0.088388346]]) -> (%8139:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=115)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=115), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117), )] (%8139:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=115)]) -> (%8140:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117), inputs_1:QuantSpec(Raw(type: Int16), uuid=118), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=117), )] (%8140:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117)], %8141:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=118), constant:[-20]]) -> (%8142:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=119), outputs_0:QuantSpec(Raw(type: UInt8), uuid=120), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8143:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=119), constant:[0]]) -> (%8144:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=120)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=120), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=115), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117), )] (%8144:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=120)], %8139:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=115)], %8142:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117)]) -> (%8145:tensor<[1, 16, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=121), )] (%8145:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117)]) -> (%8146:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=121)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=121), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122), )] (%8146:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=121)], %8136:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) -> (%8147:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122), )] (%8147:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122)]) -> (%8148:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122), )] (%8148:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122)]) -> (%8148:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=123))] (%8148:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122)]) -> (%8149:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)]) + cf.ReturnOp (%8149:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)], %8130:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112)], %8132:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=114)]) -> () } } graph.SubGraphOp @model.layers.1.mlp [using_qnn:true, symbol:model.layers.1.mlp] { - (%456:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)]) -> (%461:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=132)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=127), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=126))] (%456:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)]) -> (%457:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=127)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=127), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=128), )] (%457:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=127)]) -> (%458:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=128)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=130), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=129))] (%456:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)]) -> (%459:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=130)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=128), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=130), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=128), )] (%458:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=128)], %459:tensor<[1, 32, 6144], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=130)]) -> (%460:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=128)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=128), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=132), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=131))] (%460:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=128)]) -> (%461:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=132)]) - cf.ReturnOp (%461:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=132)]) -> () + (%8151:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=125)]) -> (%8156:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=125), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=128), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=127))] (%8151:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=125)]) -> (%8152:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=128)]) + linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=128), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129), )] (%8152:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=128)]) -> (%8153:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=125), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=131), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=130))] (%8151:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=125)]) -> (%8154:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=131)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=131), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129), )] (%8153:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129)], %8154:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=131)]) -> (%8155:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=132))] (%8155:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129)]) -> (%8156:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)]) + cf.ReturnOp (%8156:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=133)]) -> () } } graph.SubGraphOp @model.layers.2 [using_qnn:true, symbol:model.layers.2] { - (%462:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=132)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %324:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %325:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) -> (%503:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=166)], %476:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=145)], %478:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=147)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=132), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133), )] (%462:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=132)]) -> (%463:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)]) - graph.CallGraphOp @model.layers.2.self_attn (%463:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %324:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %325:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) -> (%495:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=157)], %476:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=145)], %478:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=147)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=157), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=132), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=157), )] (%495:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=157)], %462:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=132)]) -> (%496:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=157)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=157), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158), )] (%496:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=157)]) -> (%497:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158)]) - graph.CallGraphOp @model.layers.2.mlp (%497:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158)]) -> (%502:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=166)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=166), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=157), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=166), )] (%502:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=166)], %496:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=157)]) -> (%503:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=166)]) - cf.ReturnOp (%503:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=166)], %476:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=145)], %478:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=147)]) -> () + (%8157:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, 
CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8019:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %8020:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) -> (%8198:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)], %8171:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146)], %8173:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=148)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=134), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=135))] (%8157:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)]) -> (%8158:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=134)]) + graph.CallGraphOp @model.layers.2.self_attn (%8158:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=134)], %8074:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8019:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %8020:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) -> (%8190:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158)], %8171:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146)], %8173:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=148)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158), )] (%8190:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158)], %8157:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)]) -> (%8191:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=160))] (%8191:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158)]) -> (%8192:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159)]) + graph.CallGraphOp @model.layers.2.mlp (%8192:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159)]) -> (%8197:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167), )] (%8197:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=167)], %8191:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158)]) -> (%8198:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)]) + cf.ReturnOp (%8198:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)], %8171:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146)], %8173:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=148)]) -> () } } graph.SubGraphOp @model.layers.2.self_attn [using_qnn:true, symbol:model.layers.2.self_attn] { - (%463:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %324:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %325:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=33)]) -> (%495:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=157)], %476:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=145)], %478:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=147)]) { - linalg.CPU.LinearOp (%463:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)]) -> (%464:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=139)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=136), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=135))] (%463:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)]) -> (%465:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=136)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=138), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=137))] (%463:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)]) -> (%466:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=138)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=139), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=139), )] (%464:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=139)]) -> (%464:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=139)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=139), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=139), )] (%464:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=139)]) -> (%467:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=139)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=136), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=136), )] (%465:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=136)]) -> (%465:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=136)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=136), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=136), )] (%465:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=136)]) -> (%468:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=136)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=138), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=138), )] (%466:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=138)]) -> (%466:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=138)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=138), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=138), )] (%466:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=138)]) -> (%469:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=138)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=139), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=140), )] (%467:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=139)]) -> (%470:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=140)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=136), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=142), )] (%468:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=136)]) -> (%471:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=142)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=140), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=140), )] (%470:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=140)], %379:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%472:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=140)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=142), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=142), )] (%471:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=142)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%473:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=142)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=142), outputs_0:QuantSpec(Raw(type: Float16), uuid=144), )] 
(%473:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=142)]) -> (%474:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=144)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=144), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=145), )] (%474:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=144)]) -> (%475:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=145)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=145), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=145), )] (%475:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=145)]) -> (%476:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=145)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=138), outputs_0:QuantSpec(Raw(type: Float16), uuid=146), )] (%469:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=138)]) -> (%477:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=146)]) - linalg.CPU.CastTypeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=146), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=147), )] (%477:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=146)]) -> (%478:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=147)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=145), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5), )] (%324:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %476:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=145)]) -> (%479:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=147), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33), )] (%325:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)], 
%478:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=147)]) -> (%480:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5), )] (%479:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)]) -> (%481:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33), )] (%480:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) -> (%482:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=140), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=148), )] (%472:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=140)], %481:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)]) -> (%483:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=148)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=148), inputs_1:QuantSpec(Raw(type: Float32), uuid=149), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=148), )] (%483:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=148)], %484:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=149), constant:[0.088388346]]) -> (%485:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=148)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=148), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=150), )] (%485:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=148)]) -> (%486:tensor<[1, 16, 32, 1], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=150)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=150), inputs_1:QuantSpec(Raw(type: Int16), uuid=151), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=150), )] (%486:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=150)], %487:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=151), constant:[-20]]) -> (%488:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=150)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=152), outputs_0:QuantSpec(Raw(type: UInt8), uuid=153), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %489:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=152), constant:[0]]) -> (%490:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=153)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=153), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=148), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=150), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=150), )] (%490:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), 
uuid=153)], %485:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=148)], %488:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=150)]) -> (%491:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=150)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=150), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=154), )] (%491:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=150)]) -> (%492:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=154)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=154), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155), )] (%492:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=154)], %482:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=33)]) -> (%493:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155), )] (%493:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155)]) -> (%494:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155), )] (%494:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155)]) -> (%494:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=157), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), 
uuid=156))] (%494:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155)]) -> (%495:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=157)]) - cf.ReturnOp (%495:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=157)], %476:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=145)], %478:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=147)]) -> () + (%8158:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=134)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8019:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %8020:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) -> (%8190:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158)], %8171:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146)], %8173:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=148)]) { + linalg.CPU.LinearOp (%8158:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=134)]) -> (%8159:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=140)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=134), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=136))] (%8158:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=134)]) -> (%8160:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=134), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, 
quant_to_type: UInt4, scale_1_type: Float32), uuid=138))] (%8158:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=134)]) -> (%8161:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=140), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=140), )] (%8159:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=140)]) -> (%8159:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=140)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=140), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=140), )] (%8159:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=140)]) -> (%8162:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=140)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137), )] (%8160:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137)]) -> (%8160:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137), )] (%8160:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137)]) -> (%8163:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139), )] (%8161:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139)]) -> (%8161:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139), )] (%8161:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139)]) -> (%8164:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: 
Int16PerTensor), uuid=140), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=141), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=142))] (%8162:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=140)]) -> (%8165:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=141)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=143), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=144))] (%8163:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137)]) -> (%8166:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=143)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=141), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=141), )] (%8165:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=141)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8167:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=141)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=143), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=143), )] (%8166:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=143)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8168:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=143)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=143), outputs_0:QuantSpec(Raw(type: Float16), uuid=145), )] (%8168:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=143)]) -> (%8169:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=145)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=145), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146), )] (%8169:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=145)]) -> (%8170:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146), )] (%8170:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146)]) -> (%8171:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139), outputs_0:QuantSpec(Raw(type: Float16), uuid=147), )] (%8164:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139)]) -> (%8172:tensor<[1, 8, 32, 128], Float16, 
CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=147)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=147), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=148), )] (%8172:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=147)]) -> (%8173:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=148)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5), )] (%8019:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %8171:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146)]) -> (%8174:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=148), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33), )] (%8020:tensor<[1, 8, 992, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)], %8173:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=148)]) -> (%8175:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5), )] (%8174:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)]) -> (%8176:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33), )] (%8175:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) -> (%8177:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=141), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=149), )] (%8167:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=141)], %8176:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)]) -> (%8178:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=149)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=149), inputs_1:QuantSpec(Raw(type: Float32), uuid=150), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=149), )] (%8178:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=149)], %8179:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=150), constant:[0.088388346]]) -> (%8180:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=149)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=149), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=151), )] (%8180:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=149)]) -> (%8181:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=151)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=151), inputs_1:QuantSpec(Raw(type: Int16), uuid=152), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=151), )] (%8181:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=151)], %8182:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=152), constant:[-20]]) -> (%8183:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=151)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=153), outputs_0:QuantSpec(Raw(type: UInt8), uuid=154), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8184:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=153), constant:[0]]) -> (%8185:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=154)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=154), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=149), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=151), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=151), )] (%8185:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=154)], %8180:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=149)], %8183:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=151)]) -> (%8186:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=151)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=151), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155), )] (%8186:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=151)]) -> (%8187:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=156), )] (%8187:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155)], %8177:tensor<[1, 16, 1024, 128], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) -> (%8188:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=156)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=156), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=156), )] (%8188:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=156)]) -> (%8189:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=156)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=156), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=156), )] (%8189:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=156)]) -> (%8189:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=156)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=156), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158), 
weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=157))] (%8189:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=156)]) -> (%8190:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158)]) + cf.ReturnOp (%8190:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158)], %8171:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146)], %8173:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=148)]) -> () } } graph.SubGraphOp @model.layers.2.mlp [using_qnn:true, symbol:model.layers.2.mlp] { - (%497:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158)]) -> (%502:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=166)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=161), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), 
uuid=160))] (%497:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158)]) -> (%498:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=161)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=161), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=162), )] (%498:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=161)]) -> (%499:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=162)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=164), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=163))] (%497:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158)]) -> (%500:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=164)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=162), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=164), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=162), )] (%499:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=162)], %500:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=164)]) -> (%501:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=162)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=162), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=166), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=165))] (%501:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=162)]) -> (%502:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=166)]) - cf.ReturnOp (%502:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=166)]) -> () + (%8192:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=159)]) -> (%8197:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=162), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=161))] (%8192:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159)]) -> (%8193:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=162)]) + linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=162), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=163), )] (%8193:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=162)]) -> (%8194:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=163)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=165), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=164))] (%8192:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159)]) -> (%8195:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=165)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=163), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=165), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=163), )] (%8194:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=163)], %8195:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=165)]) -> (%8196:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=163)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=163), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), 
uuid=166))] (%8196:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=163)]) -> (%8197:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)]) + cf.ReturnOp (%8197:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)]) -> () } } graph.SubGraphOp @model.layers.3 [using_qnn:true, symbol:model.layers.3] { - (%503:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=166)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %326:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %327:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) -> (%544:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=200)], %517:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=179)], %519:tensor<[1, 8, 32, 
128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=181)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=166), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167), )] (%503:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=166)]) -> (%504:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)]) - graph.CallGraphOp @model.layers.3.self_attn (%504:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %326:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %327:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) -> (%536:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=191)], %517:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=179)], %519:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=181)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=191), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=166), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=191), )] (%536:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=191)], %503:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=166)]) -> (%537:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=191)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=191), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192), )] (%537:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=191)]) -> (%538:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192)]) - 
graph.CallGraphOp @model.layers.3.mlp (%538:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192)]) -> (%543:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=200)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=200), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=191), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=200), )] (%543:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=200)], %537:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=191)]) -> (%544:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=200)]) - cf.ReturnOp (%544:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=200)], %517:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=179)], %519:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=181)]) -> () + (%8198:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8021:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %8022:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) -> (%8239:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)], %8212:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180)], %8214:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=182)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=168), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=169))] (%8198:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)]) -> (%8199:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=168)]) + graph.CallGraphOp @model.layers.3.self_attn (%8199:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=168)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8021:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %8022:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) -> (%8231:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192)], %8212:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180)], %8214:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=182)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192), )] (%8231:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192)], %8198:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)]) -> (%8232:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=193), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=194))] (%8232:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192)]) -> (%8233:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=193)]) + graph.CallGraphOp @model.layers.3.mlp (%8233:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=193)]) -> (%8238:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)]) + 
linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201), )] (%8238:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)], %8232:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192)]) -> (%8239:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)]) + cf.ReturnOp (%8239:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)], %8212:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180)], %8214:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=182)]) -> () } } graph.SubGraphOp @model.layers.3.self_attn [using_qnn:true, symbol:model.layers.3.self_attn] { - (%504:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %326:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %327:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) -> (%536:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=191)], %517:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=179)], %519:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=181)]) { - linalg.CPU.LinearOp (%504:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)]) -> (%505:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=173)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=170), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=169))] (%504:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)]) -> (%506:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=170)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=172), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=171))] (%504:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)]) -> (%507:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=172)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=173), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=173), )] (%505:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=173)]) -> (%505:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=173)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=173), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=173), )] (%505:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=173)]) -> (%508:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=173)]) - linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=170), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=170), )] (%506:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=170)]) -> (%506:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=170)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=170), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=170), )] (%506:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=170)]) -> (%509:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=170)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=172), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=172), )] (%507:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=172)]) -> (%507:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=172)]) - 
linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=172), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=172), )] (%507:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=172)]) -> (%510:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=172)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=173), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=174), )] (%508:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=173)]) -> (%511:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=174)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=170), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=176), )] (%509:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=170)]) -> (%512:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=176)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=174), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=174), )] (%511:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=174)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%513:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=174)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=176), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=176), )] (%512:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=176)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%514:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=176)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=176), outputs_0:QuantSpec(Raw(type: Float16), uuid=178), )] (%514:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=176)]) -> (%515:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=178)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=178), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=179), )] (%515:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=178)]) -> (%516:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=179)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=179), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=179), )] (%516:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=179)]) -> (%517:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=179)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=172), outputs_0:QuantSpec(Raw(type: Float16), uuid=180), )] (%510:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=172)]) -> (%518:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=180)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=180), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=181), )] (%518:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=180)]) -> (%519:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=181)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=179), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6), )] (%326:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %517:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=179)]) -> (%520:tensor<[1, 8, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=181), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34), )] (%327:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)], %519:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=181)]) -> (%521:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6), )] (%520:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)]) -> (%522:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34), )] 
(%521:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) -> (%523:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=174), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=182), )] (%513:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=174)], %522:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)]) -> (%524:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=182)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=182), inputs_1:QuantSpec(Raw(type: Float32), uuid=183), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=182), )] (%524:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=182)], %525:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=183), constant:[0.088388346]]) -> (%526:tensor<[1, 16, 32, 1024], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=182)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=182), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184), )] (%526:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=182)]) -> (%527:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184), inputs_1:QuantSpec(Raw(type: Int16), uuid=185), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184), )] (%527:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184)], %528:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=185), constant:[-20]]) -> (%529:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=186), outputs_0:QuantSpec(Raw(type: UInt8), uuid=187), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %530:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), 
uuid=186), constant:[0]]) -> (%531:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=187)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=187), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=182), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184), )] (%531:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=187)], %526:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=182)], %529:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184)]) -> (%532:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=188), )] (%532:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184)]) -> (%533:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=188)]) - linalg.CPU.MatMulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=188), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189), )] (%533:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=188)], %523:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) -> (%534:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189), )] (%534:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189)]) -> (%535:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189), )] (%535:tensor<[1, 32, 16, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189)]) -> (%535:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=191), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=190))] (%535:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189)]) -> (%536:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=191)]) - cf.ReturnOp (%536:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=191)], %517:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=179)], %519:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=181)]) -> () + (%8199:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=168)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8021:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %8022:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) -> (%8231:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192)], %8212:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180)], %8214:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=182)]) { + linalg.CPU.LinearOp (%8199:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=168)]) -> (%8200:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=174)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=168), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=171), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=170))] 
(%8199:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=168)]) -> (%8201:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=171)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=168), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=172))] (%8199:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=168)]) -> (%8202:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=174), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=174), )] (%8200:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=174)]) -> (%8200:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=174)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=174), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=174), )] (%8200:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=174)]) -> (%8203:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=174)]) 
+ linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=171), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=171), )] (%8201:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=171)]) -> (%8201:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=171)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=171), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=171), )] (%8201:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=171)]) -> (%8204:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=171)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173), )] (%8202:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173)]) -> (%8202:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=173)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173), )] (%8202:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173)]) -> (%8205:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=174), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=175), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=176))] (%8203:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=174)]) -> (%8206:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=175)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=171), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=177), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=178))] (%8204:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=171)]) -> (%8207:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=177)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=175), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=175), )] (%8206:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=175)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8208:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=175)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=177), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=177), )] (%8207:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=177)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8209:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=177)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=177), outputs_0:QuantSpec(Raw(type: Float16), uuid=179), )] (%8209:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=177)]) -> (%8210:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=179)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=179), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180), )] (%8210:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=179)]) -> (%8211:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180), )] (%8211:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180)]) -> (%8212:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173), outputs_0:QuantSpec(Raw(type: Float16), uuid=181), )] (%8205:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173)]) -> (%8213:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=181)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=181), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=182), )] (%8213:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=181)]) -> (%8214:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=182)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6), )] (%8021:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %8212:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180)]) -> (%8215:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=182), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34), )] (%8022:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)], %8214:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=182)]) -> (%8216:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6), )] (%8215:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)]) -> (%8217:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34), )] (%8216:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) -> (%8218:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=175), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=183), )] (%8208:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=175)], %8217:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)]) -> (%8219:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=183)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=183), inputs_1:QuantSpec(Raw(type: Float32), uuid=184), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=183), )] (%8219:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=183)], %8220:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=184), constant:[0.088388346]]) -> (%8221:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=183)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=183), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185), )] (%8221:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=183)]) -> (%8222:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185), inputs_1:QuantSpec(Raw(type: Int16), uuid=186), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185), )] (%8222:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185)], %8223:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=186), constant:[-20]]) -> (%8224:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=187), 
outputs_0:QuantSpec(Raw(type: UInt8), uuid=188), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8225:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=187), constant:[0]]) -> (%8226:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=188)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=188), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=183), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185), )] (%8226:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=188)], %8221:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=183)], %8224:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185)]) -> (%8227:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189), )] (%8227:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185)]) -> (%8228:tensor<[1, 16, 32, 1024], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=190), )] (%8228:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189)], %8218:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) -> (%8229:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=190)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=190), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=190), )] (%8229:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=190)]) -> (%8230:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=190)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=190), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=190), )] (%8230:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=190)]) -> (%8230:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=190)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=190), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=191))] (%8230:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=190)]) -> (%8231:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192)]) + cf.ReturnOp (%8231:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192)], %8212:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180)], %8214:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=182)]) -> () } } graph.SubGraphOp @model.layers.3.mlp [using_qnn:true, symbol:model.layers.3.mlp] { - (%538:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192)]) -> (%543:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=200)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=195), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=194))] (%538:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192)]) -> (%539:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=195)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=195), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=196), )] (%539:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=195)]) -> (%540:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=196)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=192), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=198), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=197))] (%538:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192)]) -> (%541:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=198)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=196), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=198), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=196), )] (%540:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=196)], %541:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=198)]) -> (%542:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=196)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=196), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=200), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 
32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=199))] (%542:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=196)]) -> (%543:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=200)]) - cf.ReturnOp (%543:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=200)]) -> () + (%8233:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=193)]) -> (%8238:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=193), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=196), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=195))] (%8233:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=193)]) -> (%8234:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=196)]) + linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=196), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=197), )] (%8234:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=196)]) -> (%8235:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=197)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=193), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=199), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=198))] (%8233:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=193)]) -> (%8236:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=199)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=197), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=199), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=197), )] (%8235:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=197)], %8236:tensor<[1, 32, 6144], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=199)]) -> (%8237:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=197)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=197), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=200))] (%8237:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=197)]) -> (%8238:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)]) + cf.ReturnOp (%8238:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)]) -> () } } graph.SubGraphOp @model.layers.4 [using_qnn:true, symbol:model.layers.4] { - (%544:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=200)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 
1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %328:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %329:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) -> (%585:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234)], %558:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=213)], %560:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=215)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=200), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201), )] (%544:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=200)]) -> (%545:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)]) - graph.CallGraphOp @model.layers.4.self_attn (%545:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %328:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %329:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) -> (%577:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=225)], %558:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=213)], %560:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=215)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=225), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=200), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=225), )] (%577:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=225)], %544:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=200)]) -> (%578:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=225)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=225), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226), )] (%578:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=225)]) -> (%579:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226)]) - graph.CallGraphOp @model.layers.4.mlp (%579:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226)]) -> (%584:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=225), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234), )] (%584:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234)], %578:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=225)]) -> (%585:tensor<[1, 
32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234)]) - cf.ReturnOp (%585:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234)], %558:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=213)], %560:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=215)]) -> () + (%8239:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8023:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %8024:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) -> (%8280:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)], %8253:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: 
Int8, scale_type: Float32), uuid=214)], %8255:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=216)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=202), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=203))] (%8239:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)]) -> (%8240:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=202)]) + graph.CallGraphOp @model.layers.4.self_attn (%8240:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=202)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8023:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %8024:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=35)]) -> (%8272:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226)], %8253:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214)], %8255:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=216)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226), )] (%8272:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226)], %8239:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)]) -> (%8273:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=227), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=228))] (%8273:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226)]) -> (%8274:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=227)]) + graph.CallGraphOp @model.layers.4.mlp (%8274:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=227)]) -> (%8279:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235), )] (%8279:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)], %8273:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226)]) -> (%8280:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)]) + cf.ReturnOp (%8280:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)], %8253:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214)], %8255:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=216)]) -> () } } graph.SubGraphOp @model.layers.4.self_attn [using_qnn:true, symbol:model.layers.4.self_attn] { - (%545:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %328:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %329:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) -> (%577:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=225)], %558:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=213)], %560:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=215)]) { - linalg.CPU.LinearOp (%545:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=201)]) -> (%546:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=207)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=204), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=203))] (%545:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)]) -> (%547:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=204)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=206), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=205))] (%545:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)]) -> (%548:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=206)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=207), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=207), 
)] (%546:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=207)]) -> (%546:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=207)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=207), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=207), )] (%546:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=207)]) -> (%549:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=207)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=204), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=204), )] (%547:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=204)]) -> (%547:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=204)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=204), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=204), )] (%547:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=204)]) -> (%550:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=204)]) - linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=206), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=206), )] (%548:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=206)]) -> (%548:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=206)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=206), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=206), )] (%548:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=206)]) -> (%551:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=206)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=207), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=208), )] (%549:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=207)]) -> (%552:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=208)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=204), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=210), )] (%550:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=204)]) -> (%553:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=210)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=208), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=208), )] (%552:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=208)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%554:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=208)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=210), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=210), )] (%553:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=210)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%555:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=210)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=210), outputs_0:QuantSpec(Raw(type: Float16), uuid=212), )] (%555:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=210)]) -> (%556:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=212)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=212), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=213), )] (%556:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=212)]) -> (%557:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=213)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=213), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=213), )] (%557:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=213)]) -> (%558:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=213)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=206), outputs_0:QuantSpec(Raw(type: Float16), uuid=214), )] (%551:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=206)]) -> (%559:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=214)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=214), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=215), )] (%559:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=214)]) -> (%560:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=215)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=213), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7), )] (%328:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %558:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=213)]) -> (%561:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=215), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35), )] (%329:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)], %560:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=215)]) -> (%562:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7), )] 
(%561:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)]) -> (%563:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35), )] (%562:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) -> (%564:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=208), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=216), )] (%554:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=208)], %563:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)]) -> (%565:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=216)]) - linalg.CPU.MulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=216), inputs_1:QuantSpec(Raw(type: Float32), uuid=217), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=216), )] (%565:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=216)], %566:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=217), constant:[0.088388346]]) -> (%567:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=216)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=216), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=218), )] (%567:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=216)]) -> (%568:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=218)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=218), inputs_1:QuantSpec(Raw(type: Int16), uuid=219), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=218), )] (%568:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=218)], %569:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=219), constant:[-20]]) -> (%570:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=218)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=220), outputs_0:QuantSpec(Raw(type: UInt8), uuid=221), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %571:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=220), constant:[0]]) -> (%572:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=221)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=221), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=216), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=218), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=218), )] (%572:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=221)], %567:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=216)], %570:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=218)]) -> (%573:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=218)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=218), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=222), )] (%573:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=218)]) -> (%574:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=222)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=222), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=223), )] (%574:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=222)], %564:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) -> (%575:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=223)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=223), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=223), )] (%575:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=223)]) -> (%576:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=223)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=223), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=223), )] (%576:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=223)]) -> (%576:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=223)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=223), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=225), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=224))] (%576:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=223)]) -> (%577:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=225)]) - cf.ReturnOp (%577:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=225)], %558:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=213)], %560:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=215)]) -> () + (%8240:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=202)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8023:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %8024:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) -> (%8272:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226)], %8253:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214)], %8255:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=216)]) { + linalg.CPU.LinearOp (%8240:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=202)]) -> (%8241:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=208)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=202), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=205), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=204))] (%8240:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=202)]) -> (%8242:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=205)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=202), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=206))] (%8240:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=202)]) -> (%8243:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=208), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=208), )] (%8241:tensor<[1, 32, 
2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=208)]) -> (%8241:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=208)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=208), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=208), )] (%8241:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=208)]) -> (%8244:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=208)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=205), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=205), )] (%8242:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=205)]) -> (%8242:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=205)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=205), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=205), )] (%8242:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=205)]) -> (%8245:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=205)]) + linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207), )] (%8243:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207)]) -> (%8243:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207), )] (%8243:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207)]) -> (%8246:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=208), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=209), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=210))] (%8244:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=208)]) -> (%8247:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=209)]) + linalg.CPU.RMSNormOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=205), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=211), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=212))] (%8245:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=205)]) -> (%8248:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=211)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=209), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=209), )] (%8247:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=209)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8249:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=209)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=211), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=211), )] (%8248:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=211)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8250:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=211)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=211), outputs_0:QuantSpec(Raw(type: Float16), uuid=213), )] (%8250:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=211)]) -> (%8251:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=213)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=213), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=214), )] (%8251:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=213)]) -> (%8252:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214), )] (%8252:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214)]) -> (%8253:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207), outputs_0:QuantSpec(Raw(type: Float16), uuid=215), )] (%8246:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207)]) -> (%8254:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=215)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=215), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=216), )] (%8254:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=215)]) -> (%8255:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=216)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7), )] (%8023:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %8253:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214)]) -> (%8256:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=216), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35), )] (%8024:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)], %8255:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=216)]) -> (%8257:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 
127, quant_to_type: Int8, scale_type: Float32), uuid=7), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7), )] (%8256:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)]) -> (%8258:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35), )] (%8257:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) -> (%8259:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=209), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217), )] (%8249:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=209)], %8258:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)]) -> (%8260:tensor<[1, 16, 32, 1024], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217), inputs_1:QuantSpec(Raw(type: Float32), uuid=218), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217), )] (%8260:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217)], %8261:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=218), constant:[0.088388346]]) -> (%8262:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219), )] (%8262:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217)]) -> (%8263:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219), inputs_1:QuantSpec(Raw(type: Int16), uuid=220), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219), 
)] (%8263:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219)], %8264:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=220), constant:[-20]]) -> (%8265:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=221), outputs_0:QuantSpec(Raw(type: UInt8), uuid=222), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8266:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=221), constant:[0]]) -> (%8267:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=222)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=222), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219), )] (%8267:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=222)], %8262:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217)], %8265:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219)]) -> (%8268:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=223), )] (%8268:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219)]) -> (%8269:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=223)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=223), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=224), )] (%8269:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=223)], %8259:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) -> (%8270:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=224)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=224), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=224), )] (%8270:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=224)]) -> (%8271:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=224)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=224), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=224), )] (%8271:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=224)]) -> (%8271:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=224)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=224), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=225))] (%8271:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=224)]) -> (%8272:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226)]) + cf.ReturnOp (%8272:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226)], %8253:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214)], %8255:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=216)]) -> () } } graph.SubGraphOp @model.layers.4.mlp [using_qnn:true, symbol:model.layers.4.mlp] { - (%579:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226)]) -> (%584:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=229), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=228))] (%579:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226)]) -> (%580:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=229)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=229), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=230), )] (%580:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=229)]) -> (%581:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=230)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=232), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=231))] (%579:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226)]) -> (%582:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=232)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=230), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=232), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=230), )] (%581:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=230)], %582:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=232)]) -> (%583:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=230)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=230), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=233))] (%583:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=230)]) -> (%584:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234)]) - cf.ReturnOp (%584:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234)]) -> () + (%8274:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=227)]) -> (%8279:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=227), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=230), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, 
block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=229))] (%8274:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=227)]) -> (%8275:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=230)]) + linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=230), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=231), )] (%8275:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=230)]) -> (%8276:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=231)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=227), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=233), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=232))] (%8274:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=227)]) -> (%8277:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=233)]) + linalg.CPU.MulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=231), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=233), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=231), )] (%8276:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=231)], %8277:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=233)]) -> (%8278:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=231)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=231), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=234))] (%8278:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=231)]) -> (%8279:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)]) + cf.ReturnOp (%8279:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)]) -> () } } graph.SubGraphOp 
@model.layers.5 [using_qnn:true, symbol:model.layers.5] { - (%585:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %330:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %331:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) -> (%626:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=268)], %599:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=247)], %601:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=249)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235), )] (%585:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=234)]) -> (%586:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)]) - graph.CallGraphOp @model.layers.5.self_attn (%586:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %330:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %331:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) -> (%618:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=259)], %599:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=247)], %601:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=249)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=259), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=234), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=259), )] (%618:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=259)], %585:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234)]) -> (%619:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=259)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=259), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260), )] (%619:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=259)]) -> (%620:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260)]) - graph.CallGraphOp @model.layers.5.mlp (%620:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260)]) -> (%625:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=268)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=268), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=259), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=268), )] (%625:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=268)], %619:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=259)]) -> (%626:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=268)]) - cf.ReturnOp (%626:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=268)], %599:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=247)], %601:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=249)]) -> () + (%8280:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8025:tensor<[1, 8, 128, 992], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %8026:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) -> (%8321:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)], %8294:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248)], %8296:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=250)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=237))] (%8280:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)]) -> (%8281:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236)]) + graph.CallGraphOp @model.layers.5.self_attn (%8281:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8025:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %8026:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) -> (%8313:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260)], %8294:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248)], %8296:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=250)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260), )] (%8313:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260)], %8280:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)]) -> 
(%8314:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=261), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=262))] (%8314:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260)]) -> (%8315:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=261)]) + graph.CallGraphOp @model.layers.5.mlp (%8315:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=261)]) -> (%8320:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269), )] (%8320:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)], %8314:tensor<[1, 32, 
2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260)]) -> (%8321:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)]) + cf.ReturnOp (%8321:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)], %8294:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248)], %8296:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=250)]) -> () } } graph.SubGraphOp @model.layers.5.self_attn [using_qnn:true, symbol:model.layers.5.self_attn] { - (%586:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %330:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %331:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) -> (%618:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=259)], %599:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=247)], %601:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=249)]) { - linalg.CPU.LinearOp (%586:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)]) -> (%587:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=241)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=238), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=237))] (%586:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)]) -> (%588:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=238)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=240), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 
7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=239))] (%586:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)]) -> (%589:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=240)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=241), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=241), )] (%587:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=241)]) -> (%587:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=241)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=241), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=241), )] (%587:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=241)]) -> (%590:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=241)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=238), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=238), )] (%588:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=238)]) -> (%588:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=238)]) - linalg.CPU.TransposeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=238), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=238), )] (%588:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=238)]) -> (%591:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=238)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=240), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=240), )] (%589:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=240)]) -> (%589:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=240)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=240), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=240), )] (%589:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=240)]) -> (%592:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=240)]) - 
linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=241), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=242), )] (%590:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=241)]) -> (%593:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=242)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=238), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=244), )] (%591:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=238)]) -> (%594:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=244)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=242), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=242), )] (%593:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=242)], %379:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%595:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=242)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=244), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=244), )] (%594:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=244)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%596:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=244)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=244), outputs_0:QuantSpec(Raw(type: Float16), uuid=246), )] 
(%596:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=244)]) -> (%597:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=246)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=246), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=247), )] (%597:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=246)]) -> (%598:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=247)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=247), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=247), )] (%598:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=247)]) -> (%599:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=247)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=240), outputs_0:QuantSpec(Raw(type: Float16), uuid=248), )] (%592:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=240)]) -> (%600:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=248)]) - linalg.CPU.CastTypeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=248), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=249), )] (%600:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=248)]) -> (%601:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=249)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=247), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8), )] (%330:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %599:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=247)]) -> (%602:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=249), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36), )] (%331:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)], 
%601:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=249)]) -> (%603:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8), )] (%602:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)]) -> (%604:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36), )] (%603:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) -> (%605:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=242), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=250), )] (%595:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=242)], %604:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)]) -> (%606:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=250)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=250), inputs_1:QuantSpec(Raw(type: Float32), uuid=251), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=250), )] (%606:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=250)], %607:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=251), constant:[0.088388346]]) -> (%608:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=250)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=250), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=252), )] (%608:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=250)]) -> (%609:tensor<[1, 16, 32, 1], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=252)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=252), inputs_1:QuantSpec(Raw(type: Int16), uuid=253), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=252), )] (%609:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=252)], %610:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=253), constant:[-20]]) -> (%611:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=252)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=254), outputs_0:QuantSpec(Raw(type: UInt8), uuid=255), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %612:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=254), constant:[0]]) -> (%613:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=255)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=255), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=250), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=252), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=252), )] (%613:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), 
uuid=255)], %608:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=250)], %611:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=252)]) -> (%614:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=252)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=252), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=256), )] (%614:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=252)]) -> (%615:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=256)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=256), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257), )] (%615:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=256)], %605:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=36)]) -> (%616:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257), )] (%616:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257)]) -> (%617:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257), )] (%617:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257)]) -> (%617:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=259), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), 
uuid=258))] (%617:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257)]) -> (%618:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=259)]) - cf.ReturnOp (%618:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=259)], %599:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=247)], %601:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=249)]) -> () + (%8281:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8025:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %8026:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) -> (%8313:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260)], %8294:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248)], %8296:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=250)]) { + linalg.CPU.LinearOp (%8281:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236)]) -> (%8282:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=242)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=238))] (%8281:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236)]) -> (%8283:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=241), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, 
quant_to_type: UInt4, scale_1_type: Float32), uuid=240))] (%8281:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236)]) -> (%8284:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=241)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=242), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=242), )] (%8282:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=242)]) -> (%8282:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=242)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=242), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=242), )] (%8282:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=242)]) -> (%8285:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=242)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239), )] (%8283:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239)]) -> (%8283:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239), )] (%8283:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239)]) -> (%8286:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=241), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=241), )] (%8284:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=241)]) -> (%8284:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=241)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=241), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=241), )] (%8284:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=241)]) -> (%8287:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=241)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: 
Int16PerTensor), uuid=242), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=243), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=244))] (%8285:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=242)]) -> (%8288:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=243)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=245), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=246))] (%8286:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239)]) -> (%8289:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=245)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=243), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=243), )] (%8288:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=243)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8290:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=243)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=245), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=245), )] (%8289:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=245)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8291:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=245)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=245), outputs_0:QuantSpec(Raw(type: Float16), uuid=247), )] (%8291:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=245)]) -> (%8292:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=247)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=247), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248), )] (%8292:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=247)]) -> (%8293:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248), )] (%8293:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248)]) -> (%8294:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=241), outputs_0:QuantSpec(Raw(type: Float16), uuid=249), )] (%8287:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=241)]) -> (%8295:tensor<[1, 8, 32, 128], Float16, 
CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=249)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=249), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=250), )] (%8295:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=249)]) -> (%8296:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=250)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8), )] (%8025:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %8294:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248)]) -> (%8297:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=250), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36), )] (%8026:tensor<[1, 8, 992, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)], %8296:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=250)]) -> (%8298:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8), )] (%8297:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)]) -> (%8299:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36), )] (%8298:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) -> (%8300:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=243), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=251), )] (%8290:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=243)], %8299:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)]) -> (%8301:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=251)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=251), inputs_1:QuantSpec(Raw(type: Float32), uuid=252), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=251), )] (%8301:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=251)], %8302:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=252), constant:[0.088388346]]) -> (%8303:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=251)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=251), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=253), )] (%8303:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=251)]) -> (%8304:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=253)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=253), inputs_1:QuantSpec(Raw(type: Int16), uuid=254), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=253), )] (%8304:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=253)], %8305:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=254), constant:[-20]]) -> (%8306:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=253)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=255), outputs_0:QuantSpec(Raw(type: UInt8), uuid=256), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8307:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=255), constant:[0]]) -> (%8308:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=256)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=256), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=251), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=253), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=253), )] (%8308:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=256)], %8303:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=251)], %8306:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=253)]) -> (%8309:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=253)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=253), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257), )] (%8309:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=253)]) -> (%8310:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=258), )] (%8310:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257)], %8300:tensor<[1, 16, 1024, 128], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) -> (%8311:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=258)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=258), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=258), )] (%8311:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=258)]) -> (%8312:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=258)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=258), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=258), )] (%8312:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=258)]) -> (%8312:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=258)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=258), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260), 
weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=259))] (%8312:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=258)]) -> (%8313:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260)]) + cf.ReturnOp (%8313:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260)], %8294:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248)], %8296:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=250)]) -> () } } graph.SubGraphOp @model.layers.5.mlp [using_qnn:true, symbol:model.layers.5.mlp] { - (%620:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260)]) -> (%625:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=268)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=263), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), 
uuid=262))] (%620:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260)]) -> (%621:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=263)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=263), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264), )] (%621:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=263)]) -> (%622:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=266), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=265))] (%620:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260)]) -> (%623:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=266)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=264), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=266), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264), )] (%622:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264)], %623:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=266)]) -> (%624:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=268), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=267))] (%624:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264)]) -> (%625:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=268)]) - cf.ReturnOp (%625:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=268)]) -> () + (%8315:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=261)]) -> (%8320:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=261), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=263))] (%8315:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=261)]) -> (%8316:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264)]) + linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=265), )] (%8316:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264)]) -> (%8317:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=265)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=261), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=267), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=266))] (%8315:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=261)]) -> (%8318:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=267)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=265), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=267), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=265), )] (%8317:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=265)], %8318:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=267)]) -> (%8319:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=265)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=265), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), 
uuid=268))] (%8319:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=265)]) -> (%8320:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)]) + cf.ReturnOp (%8320:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)]) -> () } } graph.SubGraphOp @model.layers.6 [using_qnn:true, symbol:model.layers.6] { - (%626:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=268)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %332:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %333:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) -> (%667:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=302)], %640:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=281)], %642:tensor<[1, 8, 32, 
128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=283)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=268), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269), )] (%626:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=268)]) -> (%627:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)]) - graph.CallGraphOp @model.layers.6.self_attn (%627:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %332:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %333:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) -> (%659:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=293)], %640:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=281)], %642:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=283)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=293), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=268), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=293), )] (%659:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=293)], %626:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=268)]) -> (%660:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=293)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=293), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294), )] (%660:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=293)]) -> (%661:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)]) - 
graph.CallGraphOp @model.layers.6.mlp (%661:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)]) -> (%666:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=302)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=302), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=293), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=302), )] (%666:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=302)], %660:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=293)]) -> (%667:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=302)]) - cf.ReturnOp (%667:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=302)], %640:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=281)], %642:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=283)]) -> () + (%8321:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8027:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %8028:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) -> (%8362:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)], %8335:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282)], %8337:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=270), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=271))] (%8321:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)]) -> (%8322:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=270)]) + graph.CallGraphOp @model.layers.6.self_attn (%8322:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=270)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8027:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %8028:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) -> (%8354:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)], %8335:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282)], %8337:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294), )] (%8354:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)], %8321:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)]) -> (%8355:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=295), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=296))] (%8355:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)]) -> (%8356:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=295)]) + graph.CallGraphOp @model.layers.6.mlp (%8356:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=295)]) -> (%8361:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)]) + 
linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303), )] (%8361:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)], %8355:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)]) -> (%8362:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)]) + cf.ReturnOp (%8362:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)], %8335:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282)], %8337:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)]) -> () } } graph.SubGraphOp @model.layers.6.self_attn [using_qnn:true, symbol:model.layers.6.self_attn] { - (%627:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %332:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %333:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) -> (%659:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=293)], %640:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=281)], %642:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=283)]) { - linalg.CPU.LinearOp (%627:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)]) -> (%628:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=275)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=272), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=271))] (%627:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)]) -> (%629:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=272)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=273))] (%627:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)]) -> (%630:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=275), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=275), )] (%628:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=275)]) -> (%628:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=275)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=275), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=275), )] (%628:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=275)]) -> (%631:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=275)]) - linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=272), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=272), )] (%629:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=272)]) -> (%629:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=272)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=272), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=272), )] (%629:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=272)]) -> (%632:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=272)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274), )] (%630:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274)]) -> (%630:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274)]) - 
linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274), )] (%630:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274)]) -> (%633:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=275), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=276), )] (%631:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=275)]) -> (%634:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=276)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=272), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=278), )] (%632:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=272)]) -> (%635:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=278)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=276), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=276), )] (%634:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=276)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%636:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=276)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=278), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=278), )] (%635:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=278)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%637:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=278)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=278), outputs_0:QuantSpec(Raw(type: Float16), uuid=280), )] (%637:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=278)]) -> (%638:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=280)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=280), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=281), )] (%638:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=280)]) -> (%639:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=281)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=281), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=281), )] (%639:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=281)]) -> (%640:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=281)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274), outputs_0:QuantSpec(Raw(type: Float16), uuid=282), )] (%633:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274)]) -> (%641:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=282)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=282), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=283), )] (%641:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=282)]) -> (%642:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=283)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=281), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9), )] (%332:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %640:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=281)]) -> (%643:tensor<[1, 8, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=283), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37), )] (%333:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)], %642:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=283)]) -> (%644:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9), )] (%643:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)]) -> (%645:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37), )] 
(%644:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) -> (%646:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=276), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=284), )] (%636:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=276)], %645:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)]) -> (%647:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=284)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=284), inputs_1:QuantSpec(Raw(type: Float32), uuid=285), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=284), )] (%647:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=284)], %648:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=285), constant:[0.088388346]]) -> (%649:tensor<[1, 16, 32, 1024], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=284)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=284), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=286), )] (%649:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=284)]) -> (%650:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=286)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=286), inputs_1:QuantSpec(Raw(type: Int16), uuid=287), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=286), )] (%650:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=286)], %651:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=287), constant:[-20]]) -> (%652:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=286)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=288), outputs_0:QuantSpec(Raw(type: UInt8), uuid=289), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %653:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), 
uuid=288), constant:[0]]) -> (%654:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=289)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=289), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=284), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=286), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=286), )] (%654:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=289)], %649:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=284)], %652:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=286)]) -> (%655:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=286)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=286), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=290), )] (%655:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=286)]) -> (%656:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=290)]) - linalg.CPU.MatMulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=290), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=291), )] (%656:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=290)], %646:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) -> (%657:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=291)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=291), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=291), )] (%657:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=291)]) -> (%658:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=291)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=291), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=291), )] (%658:tensor<[1, 32, 16, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=291)]) -> (%658:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=291)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=291), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=293), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=292))] (%658:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=291)]) -> (%659:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=293)]) - cf.ReturnOp (%659:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=293)], %640:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=281)], %642:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=283)]) -> () + (%8322:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=270)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8027:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %8028:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) -> (%8354:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)], %8335:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282)], %8337:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)]) { + linalg.CPU.LinearOp (%8322:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=270)]) -> (%8323:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=276)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=270), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=273), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=272))] 
(%8322:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=270)]) -> (%8324:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=273)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=270), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=274))] (%8322:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=270)]) -> (%8325:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=276), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=276), )] (%8323:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=276)]) -> (%8323:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=276)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=276), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=276), )] (%8323:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=276)]) -> (%8326:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=276)]) 
+ linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=273), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=273), )] (%8324:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=273)]) -> (%8324:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=273)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=273), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=273), )] (%8324:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=273)]) -> (%8327:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=273)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275), )] (%8325:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275)]) -> (%8325:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=275)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275), )] (%8325:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275)]) -> (%8328:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=276), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=278))] (%8326:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=276)]) -> (%8329:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=273), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=280))] (%8327:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=273)]) -> (%8330:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277), )] (%8329:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8331:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279), )] (%8330:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8332:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279), outputs_0:QuantSpec(Raw(type: Float16), uuid=281), )] (%8332:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279)]) -> (%8333:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=281)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=281), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282), )] (%8333:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=281)]) -> (%8334:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282), )] (%8334:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282)]) -> (%8335:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275), outputs_0:QuantSpec(Raw(type: Float16), uuid=283), )] (%8328:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275)]) -> (%8336:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=283)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=283), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284), )] (%8336:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=283)]) -> (%8337:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9), )] (%8027:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %8335:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282)]) -> (%8338:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37), )] (%8028:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)], %8337:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)]) -> (%8339:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9), )] (%8338:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)]) -> (%8340:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37), )] (%8339:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) -> (%8341:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=285), )] (%8331:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277)], %8340:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)]) -> (%8342:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=285)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=285), inputs_1:QuantSpec(Raw(type: Float32), uuid=286), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=285), )] (%8342:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=285)], %8343:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=286), constant:[0.088388346]]) -> (%8344:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=285)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=285), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287), )] (%8344:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=285)]) -> (%8345:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287), inputs_1:QuantSpec(Raw(type: Int16), uuid=288), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287), )] (%8345:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287)], %8346:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=288), constant:[-20]]) -> (%8347:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=289), 
outputs_0:QuantSpec(Raw(type: UInt8), uuid=290), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8348:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=289), constant:[0]]) -> (%8349:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=290)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=290), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=285), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287), )] (%8349:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=290)], %8344:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=285)], %8347:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287)]) -> (%8350:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=291), )] (%8350:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287)]) -> (%8351:tensor<[1, 16, 32, 1024], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=291)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=291), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=292), )] (%8351:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=291)], %8341:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) -> (%8352:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=292)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=292), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=292), )] (%8352:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=292)]) -> (%8353:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=292)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=292), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=292), )] (%8353:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=292)]) -> (%8353:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=292)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=292), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=293))] (%8353:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=292)]) -> (%8354:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)]) + cf.ReturnOp (%8354:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)], %8335:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282)], %8337:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)]) -> () } } graph.SubGraphOp @model.layers.6.mlp [using_qnn:true, symbol:model.layers.6.mlp] { - (%661:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)]) -> (%666:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=302)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=297), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=296))] (%661:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)]) -> (%662:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=297)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=297), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=298), )] (%662:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=297)]) -> (%663:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=298)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=294), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=300), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=299))] (%661:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)]) -> (%664:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=300)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=298), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=300), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=298), )] (%663:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=298)], %664:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=300)]) -> (%665:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=298)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=298), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=302), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 
32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=301))] (%665:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=298)]) -> (%666:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=302)]) - cf.ReturnOp (%666:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=302)]) -> () + (%8356:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=295)]) -> (%8361:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=295), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=298), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=297))] (%8356:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=295)]) -> (%8357:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=298)]) + linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=298), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=299), )] (%8357:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=298)]) -> (%8358:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=299)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=295), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=301), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=300))] (%8356:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=295)]) -> (%8359:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=301)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=299), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=301), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=299), )] (%8358:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=299)], %8359:tensor<[1, 32, 6144], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=301)]) -> (%8360:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=299)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=299), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=302))] (%8360:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=299)]) -> (%8361:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)]) + cf.ReturnOp (%8361:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)]) -> () } } graph.SubGraphOp @model.layers.7 [using_qnn:true, symbol:model.layers.7] { - (%667:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=302)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 
1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %334:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %335:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) -> (%708:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=336)], %681:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=315)], %683:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=317)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=302), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303), )] (%667:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=302)]) -> (%668:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)]) - graph.CallGraphOp @model.layers.7.self_attn (%668:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %334:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %335:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) -> (%700:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=327)], %681:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=315)], %683:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=317)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=327), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=302), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=327), )] (%700:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=327)], %667:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=302)]) -> (%701:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=327)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=327), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328), )] (%701:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=327)]) -> (%702:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328)]) - graph.CallGraphOp @model.layers.7.mlp (%702:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328)]) -> (%707:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=336)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=336), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=327), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=336), )] (%707:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=336)], %701:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=327)]) -> (%708:tensor<[1, 
32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=336)]) - cf.ReturnOp (%708:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=336)], %681:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=315)], %683:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=317)]) -> () + (%8362:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8029:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %8030:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) -> (%8403:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)], %8376:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: 
Int8, scale_type: Float32), uuid=316)], %8378:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=318)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=305))] (%8362:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)]) -> (%8363:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304)]) + graph.CallGraphOp @model.layers.7.self_attn (%8363:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8029:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %8030:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=38)]) -> (%8395:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328)], %8376:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)], %8378:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=318)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328), )] (%8395:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328)], %8362:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)]) -> (%8396:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=329), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=330))] (%8396:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328)]) -> (%8397:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=329)]) + graph.CallGraphOp @model.layers.7.mlp (%8397:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=329)]) -> (%8402:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337), )] (%8402:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)], %8396:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328)]) -> (%8403:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)]) + cf.ReturnOp (%8403:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)], %8376:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)], %8378:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=318)]) -> () } } graph.SubGraphOp @model.layers.7.self_attn [using_qnn:true, symbol:model.layers.7.self_attn] { - (%668:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %334:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %335:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) -> (%700:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=327)], %681:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=315)], %683:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=317)]) { - linalg.CPU.LinearOp (%668:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=303)]) -> (%669:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=309)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=306), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=305))] (%668:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)]) -> (%670:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=306)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=308), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=307))] (%668:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)]) -> (%671:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=308)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=309), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=309), 
)] (%669:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=309)]) -> (%669:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=309)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=309), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=309), )] (%669:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=309)]) -> (%672:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=309)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=306), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=306), )] (%670:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=306)]) -> (%670:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=306)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=306), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=306), )] (%670:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=306)]) -> (%673:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=306)]) - linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=308), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=308), )] (%671:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=308)]) -> (%671:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=308)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=308), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=308), )] (%671:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=308)]) -> (%674:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=308)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=309), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=310), )] (%672:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=309)]) -> (%675:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=310)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=306), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=312), )] (%673:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=306)]) -> (%676:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=312)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=310), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=310), )] (%675:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=310)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%677:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=310)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=312), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=312), )] (%676:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=312)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%678:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=312)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=312), outputs_0:QuantSpec(Raw(type: Float16), uuid=314), )] (%678:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=312)]) -> (%679:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=314)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=314), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=315), )] (%679:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=314)]) -> (%680:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=315)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=315), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=315), )] (%680:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=315)]) -> (%681:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=315)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=308), outputs_0:QuantSpec(Raw(type: Float16), uuid=316), )] (%674:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=308)]) -> (%682:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=316)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=316), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=317), )] (%682:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=316)]) -> (%683:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=317)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=315), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10), )] (%334:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %681:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=315)]) -> (%684:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=317), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38), )] (%335:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)], %683:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=317)]) -> (%685:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10), )] 
(%684:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)]) -> (%686:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38), )] (%685:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) -> (%687:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=310), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=318), )] (%677:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=310)], %686:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)]) -> (%688:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=318)]) - linalg.CPU.MulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=318), inputs_1:QuantSpec(Raw(type: Float32), uuid=319), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=318), )] (%688:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=318)], %689:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=319), constant:[0.088388346]]) -> (%690:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=318)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=318), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=320), )] (%690:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=318)]) -> (%691:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=320)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=320), inputs_1:QuantSpec(Raw(type: Int16), uuid=321), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=320), )] (%691:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=320)], %692:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=321), constant:[-20]]) -> (%693:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=320)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=322), outputs_0:QuantSpec(Raw(type: UInt8), uuid=323), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %694:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=322), constant:[0]]) -> (%695:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=323)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=323), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=318), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=320), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=320), )] (%695:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=323)], %690:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=318)], %693:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=320)]) -> (%696:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=320)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=320), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=324), )] (%696:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=320)]) -> (%697:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=324)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=324), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=325), )] (%697:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=324)], %687:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) -> (%698:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=325)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=325), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=325), )] (%698:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=325)]) -> (%699:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=325)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=325), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=325), )] (%699:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=325)]) -> (%699:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=325)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=325), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=327), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=326))] (%699:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=325)]) -> (%700:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=327)]) - cf.ReturnOp (%700:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=327)], %681:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=315)], %683:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=317)]) -> () + (%8363:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8029:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %8030:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) -> (%8395:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328)], %8376:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)], %8378:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=318)]) { + linalg.CPU.LinearOp (%8363:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=304)]) -> (%8364:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=310)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=306))] (%8363:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304)]) -> (%8365:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=308))] (%8363:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304)]) -> (%8366:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=310), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=310), )] (%8364:tensor<[1, 32, 
2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=310)]) -> (%8364:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=310)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=310), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=310), )] (%8364:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=310)]) -> (%8367:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=310)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307), )] (%8365:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307)]) -> (%8365:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307), )] (%8365:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307)]) -> (%8368:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307)]) + linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309), )] (%8366:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309)]) -> (%8366:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309), )] (%8366:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309)]) -> (%8369:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=310), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=311), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=312))] (%8367:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=310)]) -> (%8370:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=311)]) + linalg.CPU.RMSNormOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=313), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=314))] (%8368:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307)]) -> (%8371:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=313)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=311), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=311), )] (%8370:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=311)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8372:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=311)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=313), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=313), )] (%8371:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=313)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8373:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=313)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=313), outputs_0:QuantSpec(Raw(type: Float16), uuid=315), )] (%8373:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=313)]) -> (%8374:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=315)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=315), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=316), )] (%8374:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=315)]) -> (%8375:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316), )] (%8375:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)]) -> (%8376:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309), outputs_0:QuantSpec(Raw(type: Float16), uuid=317), )] (%8369:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309)]) -> (%8377:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=317)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=317), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=318), )] (%8377:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=317)]) -> (%8378:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=318)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10), )] (%8029:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %8376:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)]) -> (%8379:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=318), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38), )] (%8030:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)], %8378:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=318)]) -> (%8380:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10), )] (%8379:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)]) -> (%8381:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38), )] (%8380:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) -> (%8382:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=311), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319), )] (%8372:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=311)], %8381:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)]) -> (%8383:tensor<[1, 16, 
32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319), inputs_1:QuantSpec(Raw(type: Float32), uuid=320), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319), )] (%8383:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319)], %8384:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=320), constant:[0.088388346]]) -> (%8385:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=321), )] (%8385:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319)]) -> (%8386:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=321)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=321), inputs_1:QuantSpec(Raw(type: Int16), uuid=322), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=321), )] (%8386:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=321)], %8387:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=322), constant:[-20]]) -> (%8388:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=321)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=323), outputs_0:QuantSpec(Raw(type: UInt8), uuid=324), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8389:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=323), constant:[0]]) -> (%8390:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=324)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=324), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=321), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=321), )] (%8390:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=324)], %8385:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319)], %8388:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=321)]) -> (%8391:tensor<[1, 16, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=321)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=321), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=325), )] (%8391:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=321)]) -> (%8392:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=325)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=325), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326), )] (%8392:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=325)], %8382:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) -> (%8393:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326), )] (%8393:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326)]) -> (%8394:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326), )] (%8394:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326)]) -> (%8394:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=327))] (%8394:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326)]) -> (%8395:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328)]) + cf.ReturnOp (%8395:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328)], %8376:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)], %8378:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=318)]) -> () } } graph.SubGraphOp @model.layers.7.mlp [using_qnn:true, symbol:model.layers.7.mlp] { - (%702:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328)]) -> (%707:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=336)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=331), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=330))] (%702:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328)]) -> (%703:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=331)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=331), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=332), )] (%703:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=331)]) -> (%704:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=332)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=334), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=333))] (%702:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328)]) -> (%705:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=334)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=332), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=334), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=332), )] (%704:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=332)], %705:tensor<[1, 32, 6144], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=334)]) -> (%706:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=332)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=332), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=336), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=335))] (%706:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=332)]) -> (%707:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=336)]) - cf.ReturnOp (%707:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=336)]) -> () + (%8397:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=329)]) -> (%8402:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=329), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=332), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=331))] (%8397:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=329)]) -> (%8398:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=332)]) + linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=332), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=333), )] (%8398:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=332)]) -> (%8399:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=333)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=329), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=335), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=334))] (%8397:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=329)]) -> (%8400:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=335)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=333), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=335), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=333), )] (%8399:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=333)], %8400:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=335)]) -> (%8401:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=333)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=333), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=336))] (%8401:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=333)]) -> (%8402:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)]) + cf.ReturnOp (%8402:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=337)]) -> () } } graph.SubGraphOp @model.layers.8 [using_qnn:true, symbol:model.layers.8] { - (%708:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=336)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %336:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %337:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) -> (%749:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=370)], %722:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=349)], %724:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=351)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=336), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337), )] (%708:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=336)]) -> (%709:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)]) - graph.CallGraphOp @model.layers.8.self_attn (%709:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %336:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %337:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) -> (%741:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=361)], %722:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=349)], %724:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=351)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=361), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=336), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=361), )] (%741:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=361)], %708:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=336)]) -> (%742:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=361)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=361), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362), )] (%742:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=361)]) -> (%743:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362)]) - graph.CallGraphOp @model.layers.8.mlp (%743:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362)]) -> (%748:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=370)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=370), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=361), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=370), )] (%748:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=370)], %742:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=361)]) -> (%749:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=370)]) - cf.ReturnOp (%749:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=370)], %722:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=349)], %724:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=351)]) -> () + (%8403:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, 
CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8031:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %8032:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) -> (%8444:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)], %8417:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350)], %8419:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=352)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=338), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=339))] (%8403:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)]) -> (%8404:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=338)]) + graph.CallGraphOp @model.layers.8.self_attn (%8404:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=338)], %8074:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8031:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %8032:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) -> (%8436:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362)], %8417:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350)], %8419:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=352)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362), )] (%8436:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362)], %8403:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)]) -> (%8437:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=363), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=364))] (%8437:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362)]) -> (%8438:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=363)]) + graph.CallGraphOp @model.layers.8.mlp (%8438:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=363)]) -> (%8443:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371), )] (%8443:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=371)], %8437:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362)]) -> (%8444:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)]) + cf.ReturnOp (%8444:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)], %8417:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350)], %8419:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=352)]) -> () } } graph.SubGraphOp @model.layers.8.self_attn [using_qnn:true, symbol:model.layers.8.self_attn] { - (%709:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %336:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %337:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=39)]) -> (%741:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=361)], %722:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=349)], %724:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=351)]) { - linalg.CPU.LinearOp (%709:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)]) -> (%710:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=343)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=340), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=339))] (%709:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)]) -> (%711:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=340)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=342), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=341))] (%709:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)]) -> (%712:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=342)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=343), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=343), )] (%710:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=343)]) -> (%710:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=343)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=343), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=343), )] (%710:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=343)]) -> (%713:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=343)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=340), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=340), )] (%711:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=340)]) -> (%711:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=340)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=340), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=340), )] (%711:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=340)]) -> (%714:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=340)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=342), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=342), )] (%712:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=342)]) -> (%712:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=342)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=342), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=342), )] (%712:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=342)]) -> (%715:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=342)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=343), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=344), )] (%713:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=343)]) -> (%716:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=344)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=340), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=346), )] (%714:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=340)]) -> (%717:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=346)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=344), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=344), )] (%716:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=344)], %379:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%718:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=344)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=346), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=346), )] (%717:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=346)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%719:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=346)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=346), outputs_0:QuantSpec(Raw(type: Float16), uuid=348), )] 
(%719:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=346)]) -> (%720:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=348)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=348), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=349), )] (%720:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=348)]) -> (%721:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=349)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=349), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=349), )] (%721:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=349)]) -> (%722:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=349)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=342), outputs_0:QuantSpec(Raw(type: Float16), uuid=350), )] (%715:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=342)]) -> (%723:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=350)]) - linalg.CPU.CastTypeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=350), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=351), )] (%723:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=350)]) -> (%724:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=351)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=349), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11), )] (%336:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %722:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=349)]) -> (%725:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=351), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39), )] (%337:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)], 
%724:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=351)]) -> (%726:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11), )] (%725:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)]) -> (%727:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39), )] (%726:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) -> (%728:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=344), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=352), )] (%718:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=344)], %727:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)]) -> (%729:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=352)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=352), inputs_1:QuantSpec(Raw(type: Float32), uuid=353), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=352), )] (%729:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=352)], %730:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=353), constant:[0.088388346]]) -> (%731:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=352)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=352), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=354), )] (%731:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=352)]) -> (%732:tensor<[1, 16, 32, 1], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=354)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=354), inputs_1:QuantSpec(Raw(type: Int16), uuid=355), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=354), )] (%732:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=354)], %733:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=355), constant:[-20]]) -> (%734:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=354)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=356), outputs_0:QuantSpec(Raw(type: UInt8), uuid=357), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %735:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=356), constant:[1]]) -> (%736:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=357)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=357), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=352), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=354), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=354), )] (%736:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), 
uuid=357)], %731:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=352)], %734:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=354)]) -> (%737:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=354)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=354), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=358), )] (%737:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=354)]) -> (%738:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=358)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=358), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=359), )] (%738:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=358)], %728:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=39)]) -> (%739:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=359)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=359), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=359), )] (%739:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=359)]) -> (%740:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=359)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=359), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=359), )] (%740:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=359)]) -> (%740:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=359)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=359), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=361), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), 
uuid=360))] (%740:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=359)]) -> (%741:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=361)]) - cf.ReturnOp (%741:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=361)], %722:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=349)], %724:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=351)]) -> () + (%8404:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=338)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8031:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %8032:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) -> (%8436:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362)], %8417:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350)], %8419:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=352)]) { + linalg.CPU.LinearOp (%8404:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=338)]) -> (%8405:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=344)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=338), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=340))] (%8404:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=338)]) -> (%8406:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=338), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=343), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, 
quant_to_type: UInt4, scale_1_type: Float32), uuid=342))] (%8404:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=338)]) -> (%8407:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=343)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=344), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=344), )] (%8405:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=344)]) -> (%8405:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=344)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=344), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=344), )] (%8405:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=344)]) -> (%8408:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=344)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341), )] (%8406:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341)]) -> (%8406:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341), )] (%8406:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341)]) -> (%8409:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=343), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=343), )] (%8407:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=343)]) -> (%8407:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=343)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=343), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=343), )] (%8407:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=343)]) -> (%8410:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=343)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: 
Int16PerTensor), uuid=344), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=345), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=346))] (%8408:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=344)]) -> (%8411:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=345)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=347), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=348))] (%8409:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341)]) -> (%8412:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=347)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=345), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=345), )] (%8411:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=345)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8413:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=345)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=347), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=347), )] (%8412:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=347)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8414:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=347)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=347), outputs_0:QuantSpec(Raw(type: Float16), uuid=349), )] (%8414:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=347)]) -> (%8415:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=349)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=349), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350), )] (%8415:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=349)]) -> (%8416:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350), )] (%8416:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350)]) -> (%8417:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=343), outputs_0:QuantSpec(Raw(type: Float16), uuid=351), )] (%8410:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=343)]) -> (%8418:tensor<[1, 8, 32, 128], Float16, 
CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=351)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=351), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=352), )] (%8418:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=351)]) -> (%8419:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=352)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11), )] (%8031:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %8417:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350)]) -> (%8420:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=352), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39), )] (%8032:tensor<[1, 8, 992, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)], %8419:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=352)]) -> (%8421:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11), )] (%8420:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)]) -> (%8422:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39), )] (%8421:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) -> (%8423:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=345), inputs_1:QuantSpec(SymPerTensor(quant_min: 
-128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=353), )] (%8413:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=345)], %8422:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)]) -> (%8424:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=353)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=353), inputs_1:QuantSpec(Raw(type: Float32), uuid=354), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=353), )] (%8424:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=353)], %8425:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=354), constant:[0.088388346]]) -> (%8426:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=353)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=353), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=355), )] (%8426:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=353)]) -> (%8427:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=355)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=355), inputs_1:QuantSpec(Raw(type: Int16), uuid=356), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=355), )] (%8427:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=355)], %8428:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=356), constant:[-20]]) -> (%8429:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=355)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=357), outputs_0:QuantSpec(Raw(type: UInt8), uuid=358), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8430:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=357), constant:[1]]) -> (%8431:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=358)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=358), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=353), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=355), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=355), )] (%8431:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=358)], %8426:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=353)], %8429:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=355)]) -> (%8432:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=355)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=355), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=359), )] (%8432:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=355)]) -> (%8433:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=359)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=359), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360), )] (%8433:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=359)], %8423:tensor<[1, 16, 1024, 128], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) -> (%8434:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360), )] (%8434:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360)]) -> (%8435:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360), )] (%8435:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360)]) -> (%8435:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362), 
weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=361))] (%8435:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360)]) -> (%8436:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362)]) + cf.ReturnOp (%8436:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362)], %8417:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350)], %8419:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=352)]) -> () } } graph.SubGraphOp @model.layers.8.mlp [using_qnn:true, symbol:model.layers.8.mlp] { - (%743:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362)]) -> (%748:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=370)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=365), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), 
uuid=364))] (%743:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362)]) -> (%744:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=365)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=365), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=366), )] (%744:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=365)]) -> (%745:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=366)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=368), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=367))] (%743:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362)]) -> (%746:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=368)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=366), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=368), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=366), )] (%745:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=366)], %746:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=368)]) -> (%747:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=366)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=366), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=370), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=369))] (%747:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=366)]) -> (%748:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=370)]) - cf.ReturnOp (%748:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=370)]) -> () + (%8438:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=363)]) -> (%8443:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=363), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=366), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=365))] (%8438:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=363)]) -> (%8439:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=366)]) + linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=366), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367), )] (%8439:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=366)]) -> (%8440:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=363), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=369), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=368))] (%8438:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=363)]) -> (%8441:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=369)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=369), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367), )] (%8440:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367)], %8441:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=369)]) -> (%8442:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), 
uuid=370))] (%8442:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367)]) -> (%8443:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)]) + cf.ReturnOp (%8443:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)]) -> () } } graph.SubGraphOp @model.layers.9 [using_qnn:true, symbol:model.layers.9] { - (%749:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=370)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %338:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %339:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) -> (%790:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=404)], %763:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=383)], %765:tensor<[1, 8, 32, 
128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=385)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=370), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371), )] (%749:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=370)]) -> (%750:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)]) - graph.CallGraphOp @model.layers.9.self_attn (%750:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %338:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %339:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) -> (%782:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=395)], %763:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=383)], %765:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=385)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=395), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=370), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=395), )] (%782:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=395)], %749:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=370)]) -> (%783:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=395)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=395), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396), )] (%783:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=395)]) -> (%784:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396)]) - 
graph.CallGraphOp @model.layers.9.mlp (%784:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396)]) -> (%789:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=404)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=404), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=395), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=404), )] (%789:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=404)], %783:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=395)]) -> (%790:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=404)]) - cf.ReturnOp (%790:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=404)], %763:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=383)], %765:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=385)]) -> () + (%8444:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8033:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %8034:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) -> (%8485:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)], %8458:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384)], %8460:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=386)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=373))] (%8444:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)]) -> (%8445:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372)]) + graph.CallGraphOp @model.layers.9.self_attn (%8445:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8033:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %8034:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) -> (%8477:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396)], %8458:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384)], %8460:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=386)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396), )] (%8477:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396)], %8444:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)]) -> (%8478:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=398))] (%8478:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396)]) -> (%8479:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397)]) + graph.CallGraphOp @model.layers.9.mlp (%8479:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397)]) -> (%8484:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)]) + 
linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405), )] (%8484:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)], %8478:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396)]) -> (%8485:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)]) + cf.ReturnOp (%8485:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)], %8458:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384)], %8460:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=386)]) -> () } } graph.SubGraphOp @model.layers.9.self_attn [using_qnn:true, symbol:model.layers.9.self_attn] { - (%750:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %338:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %339:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) -> (%782:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=395)], %763:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=383)], %765:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=385)]) { - linalg.CPU.LinearOp (%750:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)]) -> (%751:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=377)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=374), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=373))] (%750:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)]) -> (%752:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=374)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=376), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=375))] (%750:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)]) -> (%753:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=376)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=377), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=377), )] (%751:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=377)]) -> (%751:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=377)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=377), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=377), )] (%751:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=377)]) -> (%754:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=377)]) - linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=374), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=374), )] (%752:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=374)]) -> (%752:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=374)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=374), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=374), )] (%752:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=374)]) -> (%755:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=374)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=376), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=376), )] (%753:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=376)]) -> (%753:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=376)]) - 
linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=376), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=376), )] (%753:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=376)]) -> (%756:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=376)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=377), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=378), )] (%754:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=377)]) -> (%757:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=378)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=374), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=380), )] (%755:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=374)]) -> (%758:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=380)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=378), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=378), )] (%757:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=378)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%759:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=378)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=380), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=380), )] (%758:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=380)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%760:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=380)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=380), outputs_0:QuantSpec(Raw(type: Float16), uuid=382), )] (%760:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=380)]) -> (%761:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=382)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=382), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=383), )] (%761:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=382)]) -> (%762:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=383)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=383), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=383), )] (%762:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=383)]) -> (%763:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=383)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=376), outputs_0:QuantSpec(Raw(type: Float16), uuid=384), )] (%756:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=376)]) -> (%764:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=384)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=384), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=385), )] (%764:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=384)]) -> (%765:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=385)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=383), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12), )] (%338:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %763:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=383)]) -> (%766:tensor<[1, 8, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=385), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40), )] (%339:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)], %765:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=385)]) -> (%767:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12), )] (%766:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)]) -> (%768:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40), 
)] (%767:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) -> (%769:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=378), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=386), )] (%759:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=378)], %768:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)]) -> (%770:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=386)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=386), inputs_1:QuantSpec(Raw(type: Float32), uuid=387), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=386), )] (%770:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=386)], %771:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=387), constant:[0.088388346]]) -> (%772:tensor<[1, 16, 32, 1024], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=386)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=386), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=388), )] (%772:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=386)]) -> (%773:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=388)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=388), inputs_1:QuantSpec(Raw(type: Int16), uuid=389), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=388), )] (%773:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=388)], %774:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=389), constant:[-20]]) -> (%775:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=388)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=390), outputs_0:QuantSpec(Raw(type: UInt8), uuid=391), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %776:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), 
uuid=390), constant:[-0.1796875]]) -> (%777:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=391)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=391), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=386), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=388), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=388), )] (%777:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=391)], %772:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=386)], %775:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=388)]) -> (%778:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=388)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=388), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=392), )] (%778:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=388)]) -> (%779:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=392)]) - linalg.CPU.MatMulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=392), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=393), )] (%779:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=392)], %769:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) -> (%780:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=393)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=393), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=393), )] (%780:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=393)]) -> (%781:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=393)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=393), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=393), )] (%781:tensor<[1, 32, 16, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=393)]) -> (%781:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=393)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=393), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=395), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=394))] (%781:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=393)]) -> (%782:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=395)]) - cf.ReturnOp (%782:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=395)], %763:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=383)], %765:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=385)]) -> () + (%8445:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8033:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %8034:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) -> (%8477:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396)], %8458:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384)], %8460:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=386)]) { + linalg.CPU.LinearOp (%8445:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372)]) -> (%8446:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=378)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=375), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=374))] 
(%8445:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372)]) -> (%8447:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=375)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=376))] (%8445:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372)]) -> (%8448:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=378), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=378), )] (%8446:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=378)]) -> (%8446:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=378)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=378), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=378), )] (%8446:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=378)]) -> (%8449:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=378)]) 
+ linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=375), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=375), )] (%8447:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=375)]) -> (%8447:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=375)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=375), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=375), )] (%8447:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=375)]) -> (%8450:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=375)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377), )] (%8448:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377)]) -> (%8448:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=377)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377), )] (%8448:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377)]) -> (%8451:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=378), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=380))] (%8449:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=378)]) -> (%8452:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=375), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=381), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=382))] (%8450:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=375)]) -> (%8453:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=381)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379), )] (%8452:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8454:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=381), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=381), )] (%8453:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=381)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8455:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=381)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=381), outputs_0:QuantSpec(Raw(type: Float16), uuid=383), )] (%8455:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=381)]) -> (%8456:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=383)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=383), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384), )] (%8456:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=383)]) -> (%8457:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384), )] (%8457:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384)]) -> (%8458:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377), outputs_0:QuantSpec(Raw(type: Float16), uuid=385), )] (%8451:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377)]) -> (%8459:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=385)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=385), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=386), )] (%8459:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=385)]) -> (%8460:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=386)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12), )] (%8033:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %8458:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384)]) -> (%8461:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=386), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40), )] (%8034:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)], %8460:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=386)]) -> (%8462:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12), )] (%8461:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)]) -> (%8463:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40), )] (%8462:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) -> (%8464:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387), )] (%8454:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379)], %8463:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)]) -> (%8465:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387), inputs_1:QuantSpec(Raw(type: Float32), uuid=388), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387), )] (%8465:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=387)], %8466:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=388), constant:[0.088388346]]) -> (%8467:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389), )] (%8467:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387)]) -> (%8468:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389), inputs_1:QuantSpec(Raw(type: Int16), uuid=390), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389), )] (%8468:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389)], %8469:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=390), constant:[-20]]) -> (%8470:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=391), 
outputs_0:QuantSpec(Raw(type: UInt8), uuid=392), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8471:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=391), constant:[-0.1796875]]) -> (%8472:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=392)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=392), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389), )] (%8472:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=392)], %8467:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387)], %8470:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389)]) -> (%8473:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=393), )] (%8473:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389)]) -> (%8474:tensor<[1, 16, 32, 1024], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=393)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=393), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394), )] (%8474:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=393)], %8464:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) -> (%8475:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394), )] (%8475:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394)]) -> (%8476:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394), )] (%8476:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394)]) -> (%8476:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=395))] (%8476:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394)]) -> (%8477:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396)]) + cf.ReturnOp (%8477:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396)], %8458:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384)], %8460:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=386)]) -> () } } graph.SubGraphOp @model.layers.9.mlp [using_qnn:true, symbol:model.layers.9.mlp] { - (%784:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396)]) -> (%789:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=404)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=399), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=398))] (%784:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396)]) -> (%785:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=399)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=399), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=400), )] (%785:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=399)]) -> (%786:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=400)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=396), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=402), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=401))] (%784:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396)]) -> (%787:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=402)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=400), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=402), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=400), )] (%786:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=400)], %787:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=402)]) -> (%788:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=400)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=400), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=404), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 
32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=403))] (%788:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=400)]) -> (%789:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=404)]) - cf.ReturnOp (%789:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=404)]) -> () + (%8479:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397)]) -> (%8484:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=400), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=399))] (%8479:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397)]) -> (%8480:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=400)]) + linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=400), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=401), )] (%8480:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=400)]) -> (%8481:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=401)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=403), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=402))] (%8479:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397)]) -> (%8482:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=403)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=401), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=403), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=401), )] (%8481:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=401)], %8482:tensor<[1, 32, 6144], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=403)]) -> (%8483:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=401)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=401), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=404))] (%8483:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=401)]) -> (%8484:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)]) + cf.ReturnOp (%8484:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)]) -> () } } graph.SubGraphOp @model.layers.10 [using_qnn:true, symbol:model.layers.10] { - (%790:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=404)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 
1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %340:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %341:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) -> (%831:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=438)], %804:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=417)], %806:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=419)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=404), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405), )] (%790:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=404)]) -> (%791:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)]) - graph.CallGraphOp @model.layers.10.self_attn (%791:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %340:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %341:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) -> (%823:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=429)], %804:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=417)], %806:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=419)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=429), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=404), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=429), )] (%823:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=429)], %790:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=404)]) -> (%824:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=429)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=429), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430), )] (%824:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=429)]) -> (%825:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430)]) - graph.CallGraphOp @model.layers.10.mlp (%825:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430)]) -> (%830:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=438)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=438), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=429), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=438), )] (%830:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=438)], %824:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=429)]) -> (%831:tensor<[1, 
32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=438)]) - cf.ReturnOp (%831:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=438)], %804:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=417)], %806:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=419)]) -> () + (%8485:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8035:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %8036:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) -> (%8526:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)], %8499:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: 
Int8, scale_type: Float32), uuid=418)], %8501:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=420)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=406), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=407))] (%8485:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)]) -> (%8486:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=406)]) + graph.CallGraphOp @model.layers.10.self_attn (%8486:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=406)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8035:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %8036:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=41)]) -> (%8518:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430)], %8499:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418)], %8501:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=420)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430), )] (%8518:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430)], %8485:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)]) -> (%8519:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=431), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=432))] (%8519:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430)]) -> (%8520:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=431)]) + graph.CallGraphOp @model.layers.10.mlp (%8520:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=431)]) -> (%8525:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439), )] (%8525:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)], %8519:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430)]) -> (%8526:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)]) + cf.ReturnOp (%8526:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)], %8499:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418)], %8501:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=420)]) -> () } } graph.SubGraphOp @model.layers.10.self_attn [using_qnn:true, symbol:model.layers.10.self_attn] { - (%791:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %340:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %341:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) -> (%823:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=429)], %804:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=417)], %806:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=419)]) { - linalg.CPU.LinearOp (%791:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=405)]) -> (%792:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=411)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=408), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=407))] (%791:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)]) -> (%793:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=408)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=410), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=409))] (%791:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)]) -> (%794:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=410)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=411), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=411), 
)] (%792:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=411)]) -> (%792:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=411)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=411), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=411), )] (%792:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=411)]) -> (%795:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=411)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=408), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=408), )] (%793:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=408)]) -> (%793:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=408)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=408), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=408), )] (%793:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=408)]) -> (%796:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=408)]) - linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=410), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=410), )] (%794:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=410)]) -> (%794:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=410)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=410), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=410), )] (%794:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=410)]) -> (%797:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=410)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=411), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=412), )] (%795:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=411)]) -> (%798:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=412)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=408), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=414), )] (%796:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=408)]) -> (%799:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=414)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=412), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=412), )] (%798:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=412)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%800:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=412)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=414), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=414), )] (%799:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=414)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%801:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=414)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=414), outputs_0:QuantSpec(Raw(type: Float16), uuid=416), )] (%801:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=414)]) -> (%802:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=416)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=416), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=417), )] (%802:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=416)]) -> (%803:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=417)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=417), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=417), )] (%803:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=417)]) -> (%804:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=417)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=410), outputs_0:QuantSpec(Raw(type: Float16), uuid=418), )] (%797:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=410)]) -> (%805:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=418)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=418), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=419), )] (%805:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=418)]) -> (%806:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=419)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=417), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13), )] (%340:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %804:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=417)]) -> (%807:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=419), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41), )] (%341:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)], %806:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=419)]) -> (%808:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13), )] 
(%807:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)]) -> (%809:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41), )] (%808:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) -> (%810:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=412), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=420), )] (%800:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=412)], %809:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)]) -> (%811:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=420)]) - linalg.CPU.MulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=420), inputs_1:QuantSpec(Raw(type: Float32), uuid=421), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=420), )] (%811:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=420)], %812:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=421), constant:[0.088388346]]) -> (%813:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=420)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=420), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=422), )] (%813:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=420)]) -> (%814:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=422)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=422), inputs_1:QuantSpec(Raw(type: Int16), uuid=423), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=422), )] (%814:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=422)], %815:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=423), constant:[-20]]) -> (%816:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=422)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=424), outputs_0:QuantSpec(Raw(type: UInt8), uuid=425), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %817:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=424), constant:[-0.93359375]]) -> (%818:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=425)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=425), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=420), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=422), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=422), )] (%818:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=425)], %813:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=420)], %816:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=422)]) -> (%819:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=422)]) - linalg.CPU.SoftmaxOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=422), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=426), )] (%819:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=422)]) -> (%820:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=426)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=426), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427), )] (%820:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=426)], %810:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) -> (%821:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427), )] (%821:tensor<[1, 16, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427)]) -> (%822:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427), )] (%822:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427)]) -> (%822:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=429), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=428))] (%822:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427)]) -> (%823:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=429)]) - cf.ReturnOp (%823:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=429)], %804:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=417)], %806:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=419)]) -> () + (%8486:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=406)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8035:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %8036:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) -> (%8518:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430)], %8499:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418)], %8501:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=420)]) { + linalg.CPU.LinearOp (%8486:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=406)]) -> (%8487:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=412)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=406), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=408))] (%8486:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=406)]) -> (%8488:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=406), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=411), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=410))] (%8486:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=406)]) -> (%8489:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=411)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=412), outputs_0:QuantSpec(Raw(type: 
Int16PerTensor), uuid=412), )] (%8487:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=412)]) -> (%8487:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=412)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=412), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=412), )] (%8487:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=412)]) -> (%8490:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=412)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409), )] (%8488:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409)]) -> (%8488:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409), )] (%8488:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409)]) -> (%8491:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=409)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=411), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=411), )] (%8489:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=411)]) -> (%8489:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=411)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=411), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=411), )] (%8489:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=411)]) -> (%8492:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=411)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=412), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=413), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=414))] (%8490:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=412)]) -> (%8493:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=413)]) + linalg.CPU.RMSNormOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=415), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=416))] (%8491:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409)]) -> (%8494:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=415)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=413), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=413), )] (%8493:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=413)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8495:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=413)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=415), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=415), )] (%8494:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=415)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8496:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=415)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=415), outputs_0:QuantSpec(Raw(type: Float16), uuid=417), )] (%8496:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=415)]) -> (%8497:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=417)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=417), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=418), )] (%8497:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=417)]) -> (%8498:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418), )] (%8498:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418)]) -> (%8499:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=411), outputs_0:QuantSpec(Raw(type: Float16), uuid=419), )] (%8492:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=411)]) -> (%8500:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=419)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=419), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=420), )] (%8500:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=419)]) -> (%8501:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=420)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13), )] (%8035:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %8499:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418)]) -> (%8502:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=420), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41), )] (%8036:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)], %8501:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=420)]) -> (%8503:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13), )] (%8502:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)]) -> (%8504:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41), )] (%8503:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) -> (%8505:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=413), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=421), )] (%8495:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=413)], %8504:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)]) -> (%8506:tensor<[1, 16, 
32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=421)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=421), inputs_1:QuantSpec(Raw(type: Float32), uuid=422), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=421), )] (%8506:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=421)], %8507:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=422), constant:[0.088388346]]) -> (%8508:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=421)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=421), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=423), )] (%8508:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=421)]) -> (%8509:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=423)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=423), inputs_1:QuantSpec(Raw(type: Int16), uuid=424), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=423), )] (%8509:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=423)], %8510:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=424), constant:[-20]]) -> (%8511:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=423)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=425), outputs_0:QuantSpec(Raw(type: UInt8), uuid=426), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8512:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=425), constant:[-0.93359375]]) -> (%8513:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=426)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=426), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=421), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=423), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=423), )] (%8513:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=426)], %8508:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=421)], %8511:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=423)]) -> (%8514:tensor<[1, 16, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=423)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=423), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427), )] (%8514:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=423)]) -> (%8515:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=428), )] (%8515:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427)], %8505:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) -> (%8516:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=428)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=428), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=428), )] (%8516:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=428)]) -> (%8517:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=428)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=428), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=428), )] (%8517:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=428)]) -> (%8517:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=428)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=428), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=429))] (%8517:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=428)]) -> (%8518:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430)]) + cf.ReturnOp (%8518:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430)], %8499:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418)], %8501:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=420)]) -> () } } graph.SubGraphOp @model.layers.10.mlp [using_qnn:true, symbol:model.layers.10.mlp] { - (%825:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430)]) -> (%830:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=438)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=433), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=432))] (%825:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430)]) -> (%826:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=433)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=433), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=434), )] (%826:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=433)]) -> (%827:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=434)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=436), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=435))] (%825:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430)]) -> (%828:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=436)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=434), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=436), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=434), )] (%827:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=434)], %828:tensor<[1, 32, 6144], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=436)]) -> (%829:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=434)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=434), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=438), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=437))] (%829:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=434)]) -> (%830:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=438)]) - cf.ReturnOp (%830:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=438)]) -> () + (%8520:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=431)]) -> (%8525:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=431), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=434), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=433))] (%8520:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=431)]) -> (%8521:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=434)]) + linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=434), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=435), )] (%8521:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=434)]) -> (%8522:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=435)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=431), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=437), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=436))] (%8520:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=431)]) -> (%8523:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=437)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=435), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=437), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=435), )] (%8522:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=435)], %8523:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=437)]) -> (%8524:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=435)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=435), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=438))] (%8524:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=435)]) -> (%8525:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)]) + cf.ReturnOp (%8525:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=439)]) -> () } } graph.SubGraphOp @model.layers.11 [using_qnn:true, symbol:model.layers.11] { - (%831:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=438)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %342:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %343:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) -> (%872:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=472)], %845:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=451)], %847:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=453)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=438), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439), )] (%831:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=438)]) -> (%832:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)]) - graph.CallGraphOp @model.layers.11.self_attn (%832:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %342:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %343:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) -> (%864:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=463)], %845:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=451)], %847:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=453)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=463), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=438), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=463), )] (%864:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=463)], %831:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=438)]) -> (%865:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=463)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=463), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464), )] (%865:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=463)]) -> (%866:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464)]) - graph.CallGraphOp @model.layers.11.mlp (%866:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464)]) -> (%871:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=472)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=472), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=463), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=472), )] (%871:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=472)], %865:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=463)]) -> (%872:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=472)]) - cf.ReturnOp (%872:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=472)], %845:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=451)], %847:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=453)]) -> () + (%8526:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, 
CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8037:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %8038:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) -> (%8567:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)], %8540:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452)], %8542:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=454)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=440), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=441))] (%8526:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)]) -> (%8527:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=440)]) + graph.CallGraphOp @model.layers.11.self_attn (%8527:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=440)], %8074:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8037:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %8038:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) -> (%8559:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464)], %8540:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452)], %8542:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=454)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464), )] (%8559:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464)], %8526:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)]) -> (%8560:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=465), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=466))] (%8560:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464)]) -> (%8561:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=465)]) + graph.CallGraphOp @model.layers.11.mlp (%8561:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=465)]) -> (%8566:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473), )] (%8566:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=473)], %8560:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464)]) -> (%8567:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)]) + cf.ReturnOp (%8567:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)], %8540:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452)], %8542:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=454)]) -> () } } graph.SubGraphOp @model.layers.11.self_attn [using_qnn:true, symbol:model.layers.11.self_attn] { - (%832:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %342:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %343:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: 
Int8, scale_type: Float32), uuid=42)]) -> (%864:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=463)], %845:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=451)], %847:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=453)]) { - linalg.CPU.LinearOp (%832:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)]) -> (%833:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=445)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=442), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=441))] (%832:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)]) -> (%834:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=442)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=444), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=443))] (%832:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)]) -> (%835:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=444)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=445), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=445), )] (%833:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=445)]) -> (%833:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=445)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=445), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=445), )] (%833:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=445)]) -> (%836:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=445)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=442), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=442), )] (%834:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=442)]) -> (%834:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=442)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=442), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=442), )] (%834:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=442)]) -> (%837:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=442)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=444), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=444), )] (%835:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=444)]) -> (%835:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=444)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=444), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=444), )] (%835:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=444)]) -> (%838:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=444)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=445), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=446), )] (%836:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=445)]) -> (%839:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=446)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=442), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=448), )] (%837:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=442)]) -> (%840:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=448)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=446), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=446), )] (%839:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=446)], %379:tensor<[1, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%841:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=446)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=448), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=448), )] (%840:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=448)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%842:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=448)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=448), outputs_0:QuantSpec(Raw(type: Float16), uuid=450), 
)] (%842:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=448)]) -> (%843:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=450)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=450), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=451), )] (%843:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=450)]) -> (%844:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=451)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=451), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=451), )] (%844:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=451)]) -> (%845:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=451)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=444), outputs_0:QuantSpec(Raw(type: Float16), uuid=452), )] (%838:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=444)]) -> (%846:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=452)]) - linalg.CPU.CastTypeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=452), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=453), )] (%846:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=452)]) -> (%847:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=453)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=451), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14), )] (%342:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %845:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=451)]) -> (%848:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=453), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42), )] (%343:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)], 
%847:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=453)]) -> (%849:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14), )] (%848:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)]) -> (%850:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42), )] (%849:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) -> (%851:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=446), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=454), )] (%841:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=446)], %850:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)]) -> (%852:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=454)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=454), inputs_1:QuantSpec(Raw(type: Float32), uuid=455), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=454), )] (%852:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=454)], %853:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=455), constant:[0.088388346]]) -> (%854:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=454)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=454), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=456), )] (%854:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=454)]) -> (%855:tensor<[1, 16, 32, 1], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=456)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=456), inputs_1:QuantSpec(Raw(type: Int16), uuid=457), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=456), )] (%855:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=456)], %856:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=457), constant:[-20]]) -> (%857:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=456)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=458), outputs_0:QuantSpec(Raw(type: UInt8), uuid=459), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %858:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=458), constant:[0.515625]]) -> (%859:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=459)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=459), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=454), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=456), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=456), )] (%859:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), 
uuid=459)], %854:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=454)], %857:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=456)]) -> (%860:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=456)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=456), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=460), )] (%860:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=456)]) -> (%861:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=460)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=460), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=461), )] (%861:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=460)], %851:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=42)]) -> (%862:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=461)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=461), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=461), )] (%862:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=461)]) -> (%863:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=461)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=461), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=461), )] (%863:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=461)]) -> (%863:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=461)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=461), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=463), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), 
uuid=462))] (%863:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=461)]) -> (%864:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=463)]) - cf.ReturnOp (%864:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=463)], %845:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=451)], %847:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=453)]) -> () + (%8527:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=440)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8037:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %8038:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) -> (%8559:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464)], %8540:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452)], %8542:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=454)]) { + linalg.CPU.LinearOp (%8527:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=440)]) -> (%8528:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=446)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=440), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=442))] (%8527:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=440)]) -> (%8529:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=440), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=445), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, 
quant_to_type: UInt4, scale_1_type: Float32), uuid=444))] (%8527:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=440)]) -> (%8530:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=445)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=446), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=446), )] (%8528:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=446)]) -> (%8528:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=446)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=446), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=446), )] (%8528:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=446)]) -> (%8531:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=446)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443), )] (%8529:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443)]) -> (%8529:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443), )] (%8529:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443)]) -> (%8532:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=445), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=445), )] (%8530:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=445)]) -> (%8530:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=445)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=445), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=445), )] (%8530:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=445)]) -> (%8533:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=445)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: 
Int16PerTensor), uuid=446), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=447), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=448))] (%8531:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=446)]) -> (%8534:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=447)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=449), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=450))] (%8532:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443)]) -> (%8535:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=449)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=447), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=447), )] (%8534:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=447)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8536:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=447)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=449), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=449), )] (%8535:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=449)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8537:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=449)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=449), outputs_0:QuantSpec(Raw(type: Float16), uuid=451), )] (%8537:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=449)]) -> (%8538:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=451)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=451), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452), )] (%8538:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=451)]) -> (%8539:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452), )] (%8539:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452)]) -> (%8540:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=445), outputs_0:QuantSpec(Raw(type: Float16), uuid=453), )] (%8533:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=445)]) -> (%8541:tensor<[1, 8, 32, 128], Float16, 
CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=453)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=453), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=454), )] (%8541:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=453)]) -> (%8542:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=454)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14), )] (%8037:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %8540:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452)]) -> (%8543:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=454), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42), )] (%8038:tensor<[1, 8, 992, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)], %8542:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=454)]) -> (%8544:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14), )] (%8543:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)]) -> (%8545:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42), )] (%8544:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) -> (%8546:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=447), inputs_1:QuantSpec(SymPerTensor(quant_min: 
-128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455), )] (%8536:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=447)], %8545:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)]) -> (%8547:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455), inputs_1:QuantSpec(Raw(type: Float32), uuid=456), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455), )] (%8547:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455)], %8548:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=456), constant:[0.088388346]]) -> (%8549:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457), )] (%8549:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455)]) -> (%8550:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457), inputs_1:QuantSpec(Raw(type: Int16), uuid=458), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457), )] (%8550:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457)], %8551:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=458), constant:[-20]]) -> (%8552:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=459), outputs_0:QuantSpec(Raw(type: UInt8), uuid=460), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8553:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=459), constant:[0.515625]]) -> (%8554:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=460)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=460), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=457), )] (%8554:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=460)], %8549:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455)], %8552:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457)]) -> (%8555:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=461), )] (%8555:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457)]) -> (%8556:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=461)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=461), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462), )] (%8556:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=461)], %8546:tensor<[1, 
16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) -> (%8557:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462), )] (%8557:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462)]) -> (%8558:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462), )] (%8558:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462)]) -> (%8558:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464), 
weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=463))] (%8558:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462)]) -> (%8559:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464)]) + cf.ReturnOp (%8559:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464)], %8540:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452)], %8542:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=454)]) -> () } } graph.SubGraphOp @model.layers.11.mlp [using_qnn:true, symbol:model.layers.11.mlp] { - (%866:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464)]) -> (%871:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=472)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=467), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), 
uuid=466))] (%866:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464)]) -> (%867:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=467)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=467), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=468), )] (%867:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=467)]) -> (%868:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=468)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=470), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=469))] (%866:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464)]) -> (%869:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=470)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=468), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=470), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=468), )] (%868:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=468)], %869:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=470)]) -> (%870:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=468)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=468), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=472), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=471))] (%870:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=468)]) -> (%871:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=472)]) - cf.ReturnOp (%871:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=472)]) -> () + (%8561:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=465)]) -> (%8566:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=465), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=468), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=467))] (%8561:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=465)]) -> (%8562:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=468)]) + linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=468), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469), )] (%8562:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=468)]) -> (%8563:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=465), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=471), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=470))] (%8561:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=465)]) -> (%8564:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=471)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=471), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469), )] (%8563:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469)], %8564:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=471)]) -> (%8565:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), 
uuid=472))] (%8565:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469)]) -> (%8566:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)]) + cf.ReturnOp (%8566:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)]) -> () } } graph.SubGraphOp @model.layers.12 [using_qnn:true, symbol:model.layers.12] { - (%872:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=472)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %344:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %345:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) -> (%913:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506)], %886:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=485)], %888:tensor<[1, 8, 
32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=487)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=472), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473), )] (%872:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=472)]) -> (%873:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)]) - graph.CallGraphOp @model.layers.12.self_attn (%873:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %344:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %345:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) -> (%905:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=497)], %886:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=485)], %888:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=487)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=497), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=472), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=497), )] (%905:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=497)], %872:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=472)]) -> (%906:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=497)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=497), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498), )] (%906:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=497)]) -> (%907:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498)]) - 
graph.CallGraphOp @model.layers.12.mlp (%907:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498)]) -> (%912:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=497), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506), )] (%912:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506)], %906:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=497)]) -> (%913:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506)]) - cf.ReturnOp (%913:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506)], %886:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=485)], %888:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=487)]) -> () + (%8567:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8039:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %8040:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) -> (%8608:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)], %8581:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486)], %8583:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=488)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=475))] (%8567:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)]) -> (%8568:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474)]) + graph.CallGraphOp @model.layers.12.self_attn (%8568:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8039:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %8040:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) -> (%8600:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498)], %8581:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486)], %8583:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=488)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498), )] (%8600:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498)], %8567:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)]) -> (%8601:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=500))] (%8601:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498)]) -> (%8602:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499)]) + graph.CallGraphOp @model.layers.12.mlp (%8602:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499)]) -> (%8607:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)]) + 
linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507), )] (%8607:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)], %8601:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498)]) -> (%8608:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)]) + cf.ReturnOp (%8608:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)], %8581:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486)], %8583:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=488)]) -> () } } graph.SubGraphOp @model.layers.12.self_attn [using_qnn:true, symbol:model.layers.12.self_attn] { - (%873:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %344:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %345:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) -> (%905:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=497)], %886:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=485)], %888:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=487)]) { - linalg.CPU.LinearOp (%873:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)]) -> (%874:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=479)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=476), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=475))] (%873:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)]) -> (%875:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=476)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=478), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=477))] (%873:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)]) -> (%876:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=478)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=479), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=479), )] (%874:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=479)]) -> (%874:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=479)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=479), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=479), )] (%874:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=479)]) -> (%877:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=479)]) - linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=476), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=476), )] (%875:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=476)]) -> (%875:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=476)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=476), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=476), )] (%875:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=476)]) -> (%878:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=476)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=478), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=478), )] (%876:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=478)]) -> (%876:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=478)]) - 
linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=478), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=478), )] (%876:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=478)]) -> (%879:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=478)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=479), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=480), )] (%877:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=479)]) -> (%880:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=480)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=476), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=482), )] (%878:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=476)]) -> (%881:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=482)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=480), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=480), )] (%880:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=480)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%882:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=480)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=482), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=482), )] (%881:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=482)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%883:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=482)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=482), outputs_0:QuantSpec(Raw(type: Float16), uuid=484), )] (%883:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=482)]) -> (%884:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=484)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=484), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=485), )] (%884:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=484)]) -> (%885:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=485)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=485), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=485), )] (%885:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=485)]) -> (%886:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=485)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=478), outputs_0:QuantSpec(Raw(type: Float16), uuid=486), )] (%879:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=478)]) -> (%887:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=486)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=486), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=487), )] (%887:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=486)]) -> (%888:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=487)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=485), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15), )] (%344:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %886:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=485)]) -> (%889:tensor<[1, 8, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=487), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43), )] (%345:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)], %888:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=487)]) -> (%890:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15), )] (%889:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)]) -> (%891:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43), 
)] (%890:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) -> (%892:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=480), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=488), )] (%882:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=480)], %891:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)]) -> (%893:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=488)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=488), inputs_1:QuantSpec(Raw(type: Float32), uuid=489), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=488), )] (%893:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=488)], %894:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=489), constant:[0.088388346]]) -> (%895:tensor<[1, 16, 32, 1024], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=488)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=488), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=490), )] (%895:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=488)]) -> (%896:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=490)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=490), inputs_1:QuantSpec(Raw(type: Int16), uuid=491), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=490), )] (%896:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=490)], %897:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=491), constant:[-20]]) -> (%898:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=490)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=492), outputs_0:QuantSpec(Raw(type: UInt8), uuid=493), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %899:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), 
uuid=492), constant:[0.74609375]]) -> (%900:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=493)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=493), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=488), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=490), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=490), )] (%900:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=493)], %895:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=488)], %898:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=490)]) -> (%901:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=490)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=490), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=494), )] (%901:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=490)]) -> (%902:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=494)]) - linalg.CPU.MatMulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=494), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=495), )] (%902:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=494)], %892:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) -> (%903:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=495)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=495), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=495), )] (%903:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=495)]) -> (%904:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=495)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=495), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=495), )] (%904:tensor<[1, 32, 16, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=495)]) -> (%904:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=495)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=495), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=497), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=496))] (%904:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=495)]) -> (%905:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=497)]) - cf.ReturnOp (%905:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=497)], %886:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=485)], %888:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=487)]) -> () + (%8568:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8039:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %8040:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) -> (%8600:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498)], %8581:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486)], %8583:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=488)]) { + linalg.CPU.LinearOp (%8568:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474)]) -> (%8569:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=480)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=476))] 
(%8568:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474)]) -> (%8570:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=478))] (%8568:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474)]) -> (%8571:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=480), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=480), )] (%8569:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=480)]) -> (%8569:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=480)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=480), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=480), )] (%8569:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=480)]) -> (%8572:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=480)]) 
+ linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477), )] (%8570:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477)]) -> (%8570:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477), )] (%8570:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477)]) -> (%8573:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479), )] (%8571:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479)]) -> (%8571:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=479)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479), )] (%8571:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479)]) -> (%8574:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=480), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=481), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=482))] (%8572:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=480)]) -> (%8575:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=481)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=483), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=484))] (%8573:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477)]) -> (%8576:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=483)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=481), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=481), )] (%8575:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=481)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8577:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=481)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=483), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=483), )] (%8576:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=483)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8578:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=483)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=483), outputs_0:QuantSpec(Raw(type: Float16), uuid=485), )] (%8578:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=483)]) -> (%8579:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=485)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=485), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486), )] (%8579:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=485)]) -> (%8580:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486), )] (%8580:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486)]) -> (%8581:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479), outputs_0:QuantSpec(Raw(type: Float16), uuid=487), )] (%8574:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479)]) -> (%8582:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=487)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=487), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=488), )] (%8582:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=487)]) -> (%8583:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=488)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15), )] (%8039:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %8581:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486)]) -> (%8584:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=488), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43), )] (%8040:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)], %8583:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=488)]) -> (%8585:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15), )] (%8584:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)]) -> (%8586:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43), )] (%8585:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) -> (%8587:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=481), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489), )] (%8577:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=481)], %8586:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)]) -> (%8588:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489), inputs_1:QuantSpec(Raw(type: Float32), uuid=490), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489), )] (%8588:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=489)], %8589:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=490), constant:[0.088388346]]) -> (%8590:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491), )] (%8590:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489)]) -> (%8591:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491), inputs_1:QuantSpec(Raw(type: Int16), uuid=492), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491), )] (%8591:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491)], %8592:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=492), constant:[-20]]) -> (%8593:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=493), 
outputs_0:QuantSpec(Raw(type: UInt8), uuid=494), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8594:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=493), constant:[0.74609375]]) -> (%8595:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=494)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=494), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491), )] (%8595:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=494)], %8590:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489)], %8593:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491)]) -> (%8596:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=495), )] (%8596:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491)]) -> (%8597:tensor<[1, 16, 32, 1024], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=495)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=495), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=496), )] (%8597:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=495)], %8587:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) -> (%8598:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=496)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=496), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=496), )] (%8598:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=496)]) -> (%8599:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=496)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=496), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=496), )] (%8599:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=496)]) -> (%8599:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=496)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=496), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=497))] (%8599:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=496)]) -> (%8600:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498)]) + cf.ReturnOp (%8600:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498)], %8581:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486)], %8583:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=488)]) -> () } } graph.SubGraphOp @model.layers.12.mlp [using_qnn:true, symbol:model.layers.12.mlp] { - (%907:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498)]) -> (%912:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=501), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=500))] (%907:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498)]) -> (%908:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=501)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=501), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=502), )] (%908:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=501)]) -> (%909:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=502)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=498), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=504), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=503))] (%907:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498)]) -> (%910:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=504)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=502), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=504), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=502), )] (%909:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=502)], %910:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=504)]) -> (%911:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=502)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=502), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 
32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=505))] (%911:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=502)]) -> (%912:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506)]) - cf.ReturnOp (%912:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506)]) -> () + (%8602:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499)]) -> (%8607:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=502), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=501))] (%8602:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499)]) -> (%8603:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=502)]) + linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=502), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=503), )] (%8603:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=502)]) -> (%8604:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=503)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=505), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=504))] (%8602:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499)]) -> (%8605:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=505)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=503), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=505), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=503), )] (%8604:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=503)], %8605:tensor<[1, 32, 6144], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=505)]) -> (%8606:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=503)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=503), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=506))] (%8606:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=503)]) -> (%8607:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)]) + cf.ReturnOp (%8607:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)]) -> () } } graph.SubGraphOp @model.layers.13 [using_qnn:true, symbol:model.layers.13] { - (%913:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 
1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %346:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %347:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) -> (%954:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=540)], %927:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=519)], %929:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=521)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507), )] (%913:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506)]) -> (%914:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)]) - graph.CallGraphOp @model.layers.13.self_attn (%914:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %346:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %347:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) -> (%946:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=531)], %927:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=519)], %929:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=521)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=531), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=531), )] (%946:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=531)], %913:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506)]) -> (%947:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=531)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=531), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532), )] (%947:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=531)]) -> (%948:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532)]) - graph.CallGraphOp @model.layers.13.mlp (%948:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532)]) -> (%953:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=540)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=540), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=531), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=540), )] (%953:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=540)], %947:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=531)]) -> (%954:tensor<[1, 
32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=540)]) - cf.ReturnOp (%954:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=540)], %927:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=519)], %929:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=521)]) -> () + (%8608:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8041:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %8042:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) -> (%8649:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)], %8622:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: 
Int8, scale_type: Float32), uuid=520)], %8624:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=522)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=508), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=509))] (%8608:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)]) -> (%8609:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=508)]) + graph.CallGraphOp @model.layers.13.self_attn (%8609:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=508)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8041:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %8042:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=44)]) -> (%8641:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532)], %8622:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520)], %8624:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=522)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532), )] (%8641:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532)], %8608:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)]) -> (%8642:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=533), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=534))] (%8642:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532)]) -> (%8643:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=533)]) + graph.CallGraphOp @model.layers.13.mlp (%8643:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=533)]) -> (%8648:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541), )] (%8648:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)], %8642:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532)]) -> (%8649:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)]) + cf.ReturnOp (%8649:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)], %8622:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520)], %8624:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=522)]) -> () } } graph.SubGraphOp @model.layers.13.self_attn [using_qnn:true, symbol:model.layers.13.self_attn] { - (%914:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %346:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %347:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) -> (%946:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=531)], %927:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=519)], %929:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=521)]) { - linalg.CPU.LinearOp (%914:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=507)]) -> (%915:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=513)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=510), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=509))] (%914:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)]) -> (%916:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=510)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=512), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=511))] (%914:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)]) -> (%917:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=512)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=513), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=513), 
)] (%915:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=513)]) -> (%915:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=513)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=513), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=513), )] (%915:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=513)]) -> (%918:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=513)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=510), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=510), )] (%916:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=510)]) -> (%916:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=510)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=510), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=510), )] (%916:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=510)]) -> (%919:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=510)]) - linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=512), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=512), )] (%917:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=512)]) -> (%917:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=512)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=512), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=512), )] (%917:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=512)]) -> (%920:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=512)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=513), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=514), )] (%918:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=513)]) -> (%921:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=514)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=510), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=516), )] (%919:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=510)]) -> (%922:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=516)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=514), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=514), )] (%921:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=514)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%923:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=514)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=516), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=516), )] (%922:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=516)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%924:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=516)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=516), outputs_0:QuantSpec(Raw(type: Float16), uuid=518), )] (%924:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=516)]) -> (%925:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=518)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=518), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=519), )] (%925:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=518)]) -> (%926:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=519)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=519), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=519), )] (%926:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=519)]) -> (%927:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=519)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=512), outputs_0:QuantSpec(Raw(type: Float16), uuid=520), )] (%920:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=512)]) -> (%928:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=520)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=520), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=521), )] (%928:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=520)]) -> (%929:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=521)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=519), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16), )] (%346:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %927:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=519)]) -> (%930:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=521), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44), )] (%347:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)], %929:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=521)]) -> (%931:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16), )] 
(%930:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)]) -> (%932:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44), )] (%931:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) -> (%933:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=514), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=522), )] (%923:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=514)], %932:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)]) -> (%934:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=522)]) - linalg.CPU.MulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=522), inputs_1:QuantSpec(Raw(type: Float32), uuid=523), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=522), )] (%934:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=522)], %935:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=523), constant:[0.088388346]]) -> (%936:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=522)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=522), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=524), )] (%936:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=522)]) -> (%937:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=524)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=524), inputs_1:QuantSpec(Raw(type: Int16), uuid=525), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=524), )] (%937:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=524)], %938:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=525), constant:[-20]]) -> (%939:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=524)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=526), outputs_0:QuantSpec(Raw(type: UInt8), uuid=527), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %940:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=526), constant:[-0.78515625]]) -> (%941:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=527)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=527), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=522), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=524), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=524), )] (%941:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=527)], %936:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=522)], %939:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=524)]) -> (%942:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=524)]) - linalg.CPU.SoftmaxOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=524), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=528), )] (%942:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=524)]) -> (%943:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=528)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=528), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529), )] (%943:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=528)], %933:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) -> (%944:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529), )] (%944:tensor<[1, 16, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529)]) -> (%945:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529), )] (%945:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529)]) -> (%945:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=531), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=530))] (%945:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529)]) -> (%946:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=531)]) - cf.ReturnOp (%946:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=531)], %927:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=519)], %929:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=521)]) -> () + (%8609:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=508)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8041:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %8042:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) -> (%8641:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532)], %8622:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520)], %8624:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=522)]) { + linalg.CPU.LinearOp (%8609:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=508)]) -> (%8610:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=514)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=508), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=511), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=510))] (%8609:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=508)]) -> (%8611:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=511)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=508), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=513), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=512))] (%8609:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=508)]) -> (%8612:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=513)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=514), outputs_0:QuantSpec(Raw(type: 
Int16PerTensor), uuid=514), )] (%8610:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=514)]) -> (%8610:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=514)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=514), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=514), )] (%8610:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=514)]) -> (%8613:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=514)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=511), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=511), )] (%8611:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=511)]) -> (%8611:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=511)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=511), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=511), )] (%8611:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=511)]) -> (%8614:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=511)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=513), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=513), )] (%8612:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=513)]) -> (%8612:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=513)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=513), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=513), )] (%8612:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=513)]) -> (%8615:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=513)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=514), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=515), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=516))] (%8613:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=514)]) -> (%8616:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=515)]) + linalg.CPU.RMSNormOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=511), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=518))] (%8614:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=511)]) -> (%8617:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=515), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=515), )] (%8616:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=515)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8618:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=515)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517), )] (%8617:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8619:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517), outputs_0:QuantSpec(Raw(type: Float16), uuid=519), )] (%8619:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517)]) -> (%8620:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=519)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=519), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=520), )] (%8620:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=519)]) -> (%8621:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520), )] (%8621:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520)]) -> (%8622:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=513), outputs_0:QuantSpec(Raw(type: Float16), uuid=521), )] (%8615:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=513)]) -> (%8623:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=521)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=521), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=522), )] (%8623:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=521)]) -> (%8624:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=522)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16), )] (%8041:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %8622:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520)]) -> (%8625:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=522), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44), )] (%8042:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)], %8624:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=522)]) -> (%8626:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16), )] (%8625:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)]) -> (%8627:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44), )] (%8626:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) -> (%8628:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=515), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=523), )] (%8618:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=515)], %8627:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)]) -> (%8629:tensor<[1, 16, 
32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=523)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=523), inputs_1:QuantSpec(Raw(type: Float32), uuid=524), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=523), )] (%8629:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=523)], %8630:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=524), constant:[0.088388346]]) -> (%8631:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=523)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=523), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=525), )] (%8631:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=523)]) -> (%8632:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=525)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=525), inputs_1:QuantSpec(Raw(type: Int16), uuid=526), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=525), )] (%8632:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=525)], %8633:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=526), constant:[-20]]) -> (%8634:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=525)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=527), outputs_0:QuantSpec(Raw(type: UInt8), uuid=528), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8635:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=527), constant:[-0.78515625]]) -> (%8636:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=528)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=528), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=523), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=525), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=525), )] (%8636:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=528)], %8631:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=523)], %8634:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=525)]) -> (%8637:tensor<[1, 16, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=525)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=525), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529), )] (%8637:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=525)]) -> (%8638:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=530), )] (%8638:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529)], %8628:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) -> (%8639:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=530)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=530), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=530), )] (%8639:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=530)]) -> (%8640:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=530)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=530), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=530), )] (%8640:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=530)]) -> (%8640:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=530)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=530), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=531))] (%8640:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=530)]) -> (%8641:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532)]) + cf.ReturnOp (%8641:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532)], %8622:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520)], %8624:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=522)]) -> () } } graph.SubGraphOp @model.layers.13.mlp [using_qnn:true, symbol:model.layers.13.mlp] { - (%948:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532)]) -> (%953:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=540)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=535), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=534))] (%948:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532)]) -> (%949:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=535)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=535), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536), )] (%949:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=535)]) -> (%950:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=538), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=537))] (%948:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532)]) -> (%951:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=538)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=538), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536), )] (%950:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536)], %951:tensor<[1, 32, 6144], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=538)]) -> (%952:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=540), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=539))] (%952:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536)]) -> (%953:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=540)]) - cf.ReturnOp (%953:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=540)]) -> () + (%8643:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=533)]) -> (%8648:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=533), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=536), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=535))] (%8643:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=533)]) -> (%8644:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536)]) + linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=537), )] (%8644:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536)]) -> (%8645:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=537)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=533), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=539), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=538))] (%8643:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=533)]) -> (%8646:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=539)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=537), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=539), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=537), )] (%8645:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=537)], %8646:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=539)]) -> (%8647:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=537)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=537), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=540))] (%8647:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=537)]) -> (%8648:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)]) + cf.ReturnOp (%8648:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=541)]) -> () } } graph.SubGraphOp @model.layers.14 [using_qnn:true, symbol:model.layers.14] { - (%954:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=540)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %348:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %349:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) -> (%995:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574)], %968:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=553)], %970:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=555)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=540), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541), )] (%954:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=540)]) -> (%955:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)]) - graph.CallGraphOp @model.layers.14.self_attn (%955:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %348:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %349:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) -> (%987:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=565)], %968:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=553)], %970:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=555)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=565), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=540), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=565), )] (%987:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=565)], %954:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=540)]) -> (%988:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=565)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=565), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566), )] (%988:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=565)]) -> (%989:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)]) - graph.CallGraphOp @model.layers.14.mlp (%989:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)]) -> (%994:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=574), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=565), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574), )] (%994:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574)], %988:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=565)]) -> (%995:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574)]) - cf.ReturnOp (%995:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574)], %968:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=553)], %970:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=555)]) -> () + (%8649:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, 
CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8043:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %8044:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) -> (%8690:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)], %8663:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %8665:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=542), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=543))] (%8649:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)]) -> (%8650:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=542)]) + graph.CallGraphOp @model.layers.14.self_attn (%8650:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=542)], %8074:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8043:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %8044:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) -> (%8682:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)], %8663:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %8665:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566), )] (%8682:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)], %8649:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)]) -> (%8683:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=567), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=568))] (%8683:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)]) -> (%8684:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=567)]) + graph.CallGraphOp @model.layers.14.mlp (%8684:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=567)]) -> (%8689:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575), )] (%8689:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=575)], %8683:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)]) -> (%8690:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)]) + cf.ReturnOp (%8690:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)], %8663:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %8665:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)]) -> () } } graph.SubGraphOp @model.layers.14.self_attn [using_qnn:true, symbol:model.layers.14.self_attn] { - (%955:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %348:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %349:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: 
Int8, scale_type: Float32), uuid=45)]) -> (%987:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=565)], %968:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=553)], %970:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=555)]) { - linalg.CPU.LinearOp (%955:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)]) -> (%956:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=547)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=543))] (%955:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)]) -> (%957:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=546), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=545))] (%955:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)]) -> (%958:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=546)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=547), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=547), )] (%956:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=547)]) -> (%956:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=547)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=547), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=547), )] (%956:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=547)]) -> (%959:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=547)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544), )] (%957:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544)]) -> (%957:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=544)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544), )] (%957:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544)]) -> (%960:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=546), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=546), )] (%958:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=546)]) -> (%958:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=546)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=546), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=546), )] (%958:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=546)]) -> (%961:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=546)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=547), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=548), )] (%959:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=547)]) -> (%962:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=548)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=550), )] (%960:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544)]) -> (%963:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=550)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=548), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=548), )] (%962:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=548)], %379:tensor<[1, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%964:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=548)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=550), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=550), )] (%963:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=550)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%965:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=550)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=550), outputs_0:QuantSpec(Raw(type: Float16), uuid=552), 
)] (%965:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=550)]) -> (%966:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=552)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=552), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=553), )] (%966:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=552)]) -> (%967:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=553)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=553), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=553), )] (%967:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=553)]) -> (%968:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=553)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=546), outputs_0:QuantSpec(Raw(type: Float16), uuid=554), )] (%961:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=546)]) -> (%969:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=554)]) - linalg.CPU.CastTypeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=554), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=555), )] (%969:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=554)]) -> (%970:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=555)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=553), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17), )] (%348:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %968:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=553)]) -> (%971:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=555), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45), )] (%349:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)], 
%970:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=555)]) -> (%972:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17), )] (%971:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)]) -> (%973:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45), )] (%972:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) -> (%974:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=548), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=556), )] (%964:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=548)], %973:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)]) -> (%975:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=556)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=556), inputs_1:QuantSpec(Raw(type: Float32), uuid=557), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=556), )] (%975:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=556)], %976:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=557), constant:[0.088388346]]) -> (%977:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=556)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=556), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=558), )] (%977:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=556)]) -> (%978:tensor<[1, 16, 32, 1], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=558)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=558), inputs_1:QuantSpec(Raw(type: Int16), uuid=559), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=558), )] (%978:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=558)], %979:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=559), constant:[-20]]) -> (%980:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=558)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=560), outputs_0:QuantSpec(Raw(type: UInt8), uuid=561), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %981:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=560), constant:[-0.46289062]]) -> (%982:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=561)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=561), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=556), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=558), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=558), )] (%982:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: 
UInt8), uuid=561)], %977:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=556)], %980:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=558)]) -> (%983:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=558)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=558), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=562), )] (%983:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=558)]) -> (%984:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=562)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=562), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=563), )] (%984:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=562)], %974:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=45)]) -> (%985:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=563)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=563), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=563), )] (%985:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=563)]) -> (%986:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=563)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=563), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=563), )] (%986:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=563)]) -> (%986:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=563)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=563), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=565), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: 
Float32), uuid=564))] (%986:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=563)]) -> (%987:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=565)]) - cf.ReturnOp (%987:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=565)], %968:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=553)], %970:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=555)]) -> () + (%8650:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=542)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8043:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %8044:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) -> (%8682:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)], %8663:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %8665:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)]) { + linalg.CPU.LinearOp (%8650:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=542)]) -> (%8651:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=548)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=542), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=544))] (%8650:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=542)]) -> (%8652:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=542), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, 
block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=546))] (%8650:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=542)]) -> (%8653:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=548), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=548), )] (%8651:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=548)]) -> (%8651:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=548)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=548), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=548), )] (%8651:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=548)]) -> (%8654:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=548)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545), )] (%8652:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545)]) -> (%8652:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545)]) + linalg.CPU.TransposeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545), )] (%8652:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545)]) -> (%8655:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547), )] (%8653:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547)]) -> (%8653:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547), )] (%8653:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547)]) -> (%8656:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547)]) + 
linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=548), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=549), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=550))] (%8654:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=548)]) -> (%8657:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=549)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=551), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=552))] (%8655:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545)]) -> (%8658:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=551)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=549), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=549), )] (%8657:tensor<[1, 16, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=549)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8659:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=549)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=551), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=551), )] (%8658:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=551)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8660:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=551)]) + linalg.CPU.CastTypeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=551), outputs_0:QuantSpec(Raw(type: Float16), uuid=553), )] (%8660:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=551)]) -> (%8661:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=553)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=553), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554), )] (%8661:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=553)]) -> (%8662:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554), )] (%8662:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)]) -> (%8663:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547), outputs_0:QuantSpec(Raw(type: Float16), uuid=555), )] (%8656:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=547)]) -> (%8664:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=555)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=555), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556), )] (%8664:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=555)]) -> (%8665:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17), )] (%8043:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %8663:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)]) -> (%8666:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=45), )] (%8044:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)], %8665:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)]) -> (%8667:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17), )] (%8666:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)]) -> (%8668:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45), )] (%8667:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) -> (%8669:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=549), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557), )] (%8659:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=549)], %8668:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)]) -> (%8670:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557), inputs_1:QuantSpec(Raw(type: Float32), uuid=558), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557), )] (%8670:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557)], %8671:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=558), constant:[0.088388346]]) -> (%8672:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559), )] (%8672:tensor<[1, 16, 32, 1024], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557)]) -> (%8673:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559), inputs_1:QuantSpec(Raw(type: Int16), uuid=560), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559), )] (%8673:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559)], %8674:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=560), constant:[-20]]) -> (%8675:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=561), outputs_0:QuantSpec(Raw(type: UInt8), uuid=562), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8676:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=561), constant:[-0.46289062]]) -> (%8677:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=562)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=562), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559), )] (%8677:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=562)], %8672:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557)], %8675:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559)]) -> (%8678:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=563), )] (%8678:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559)]) -> (%8679:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=563)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=563), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564), )] (%8679:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=563)], %8669:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) -> (%8680:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564), )] (%8680:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564)]) -> (%8681:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564), )] (%8681:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564)]) -> (%8681:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=566), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=565))] (%8681:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564)]) -> (%8682:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)]) + cf.ReturnOp (%8682:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)], %8663:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %8665:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)]) -> () } } graph.SubGraphOp @model.layers.14.mlp [using_qnn:true, symbol:model.layers.14.mlp] { - (%989:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)]) -> (%994:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=569), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=568))] (%989:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)]) -> (%990:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=569)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=569), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=570), )] (%990:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=569)]) -> (%991:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=570)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=572), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=571))] (%989:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)]) -> (%992:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=572)]) - linalg.CPU.MulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=570), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=572), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=570), )] (%991:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=570)], %992:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=572)]) -> (%993:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=570)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=570), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=573))] (%993:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=570)]) -> (%994:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574)]) - cf.ReturnOp (%994:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574)]) -> () + (%8684:tensor<[1, 32, 
2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=567)]) -> (%8689:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=567), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=570), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=569))] (%8684:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=567)]) -> (%8685:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=570)]) + linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=570), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=571), )] (%8685:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=570)]) -> (%8686:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=571)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=567), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=573), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=572))] (%8684:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=567)]) -> (%8687:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=573)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=571), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=573), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=571), )] (%8686:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=571)], %8687:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=573)]) -> (%8688:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=571)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=571), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 
7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=574))] (%8688:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=571)]) -> (%8689:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)]) + cf.ReturnOp (%8689:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)]) -> () } } graph.SubGraphOp @model.layers.15 [using_qnn:true, symbol:model.layers.15] { - (%995:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %350:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %351:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) -> (%1036:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=608)], %1009:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=587)], %1011:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=589)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575), )] (%995:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574)]) -> (%996:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)]) - graph.CallGraphOp @model.layers.15.self_attn (%996:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %350:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %351:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) -> 
(%1028:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=599)], %1009:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=587)], %1011:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=589)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=599), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=599), )] (%1028:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=599)], %995:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574)]) -> (%1029:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=599)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=599), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600), )] (%1029:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=599)]) -> (%1030:tensor<[1, 32, 
2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600)]) - graph.CallGraphOp @model.layers.15.mlp (%1030:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600)]) -> (%1035:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=608)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=608), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=599), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=608), )] (%1035:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=608)], %1029:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=599)]) -> (%1036:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=608)]) - cf.ReturnOp (%1036:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=608)], %1009:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=587)], %1011:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=589)]) -> () + (%8690:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8045:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %8046:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) -> (%8731:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)], %8704:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588)], %8706:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=590)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=576), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=577))] (%8690:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)]) -> (%8691:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=576)]) + graph.CallGraphOp @model.layers.15.self_attn (%8691:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=576)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8045:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %8046:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) -> (%8723:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600)], %8704:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588)], %8706:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=590)]) + linalg.CPU.AddOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600), )] (%8723:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600)], %8690:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)]) -> (%8724:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=601), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=602))] (%8724:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600)]) -> (%8725:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=601)]) + graph.CallGraphOp @model.layers.15.mlp (%8725:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=601)]) -> (%8730:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609), )] (%8730:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)], %8724:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600)]) -> (%8731:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)]) + cf.ReturnOp (%8731:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)], %8704:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588)], %8706:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=590)]) -> () } } graph.SubGraphOp @model.layers.15.self_attn [using_qnn:true, symbol:model.layers.15.self_attn] { - (%996:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)], %379:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %350:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %351:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) -> (%1028:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=599)], %1009:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=587)], %1011:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=589)]) { - linalg.CPU.LinearOp (%996:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)]) -> (%997:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=581)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=578), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=577))] (%996:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)]) -> (%998:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=578)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=580), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=579))] (%996:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)]) -> (%999:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=580)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=581), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=581), )] (%997:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=581)]) -> (%997:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=581)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=581), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=581), )] (%997:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=581)]) -> (%1000:tensor<[1, 16, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=581)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=578), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=578), )] (%998:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=578)]) -> (%998:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=578)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=578), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=578), )] (%998:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=578)]) -> (%1001:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=578)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=580), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=580), )] (%999:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=580)]) -> (%999:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=580)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=580), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=580), )] (%999:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=580)]) -> (%1002:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=580)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=581), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=582), )] (%1000:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=581)]) -> (%1003:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=582)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=578), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=584), )] (%1001:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=578)]) -> (%1004:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=584)]) - linalg.CPU.RoPEOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=582), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=582), )] (%1003:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=582)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1005:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=582)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=584), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=584), )] (%1004:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=584)], %379:tensor<[1, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1006:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=584)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=584), outputs_0:QuantSpec(Raw(type: Float16), uuid=586), )] (%1006:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=584)]) -> (%1007:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=586)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=586), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=587), )] (%1007:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=586)]) -> (%1008:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=587)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=587), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=587), )] (%1008:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=587)]) -> (%1009:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=587)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=580), outputs_0:QuantSpec(Raw(type: Float16), uuid=588), )] (%1002:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=580)]) -> (%1010:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=588)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=588), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=589), )] (%1010:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=588)]) -> (%1011:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=589)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=587), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18), )] (%350:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %1009:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=587)]) -> (%1012:tensor<[1, 8, 128, 
1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=589), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46), )] (%351:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)], %1011:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=589)]) -> (%1013:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18), )] (%1012:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)]) -> (%1014:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=46), )] (%1013:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) -> (%1015:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=582), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=590), )] (%1005:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=582)], %1014:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)]) -> (%1016:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=590)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=590), inputs_1:QuantSpec(Raw(type: Float32), uuid=591), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=590), )] (%1016:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=590)], %1017:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=591), 
constant:[0.088388346]]) -> (%1018:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=590)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=590), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=592), )] (%1018:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=590)]) -> (%1019:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=592)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=592), inputs_1:QuantSpec(Raw(type: Int16), uuid=593), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=592), )] (%1019:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=592)], %1020:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=593), constant:[-20]]) -> (%1021:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=592)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=594), outputs_0:QuantSpec(Raw(type: UInt8), uuid=595), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], 
%1022:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=594), constant:[0.953125]]) -> (%1023:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=595)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=595), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=590), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=592), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=592), )] (%1023:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=595)], %1018:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=590)], %1021:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=592)]) -> (%1024:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=592)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=592), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=596), )] (%1024:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=592)]) -> (%1025:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=596)]) 
- linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=596), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597), )] (%1025:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=596)], %1015:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) -> (%1026:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597), )] (%1026:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597)]) -> (%1027:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597), )] (%1027:tensor<[1, 32, 16, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597)]) -> (%1027:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=599), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=598))] (%1027:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597)]) -> (%1028:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=599)]) - cf.ReturnOp (%1028:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=599)], %1009:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=587)], %1011:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=589)]) -> () + (%8691:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=576)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8045:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %8046:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) -> (%8723:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600)], %8704:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588)], %8706:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=590)]) { + linalg.CPU.LinearOp (%8691:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=576)]) -> (%8692:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=582)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=576), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=578))] 
(%8691:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=576)]) -> (%8693:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=576), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=580))] (%8691:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=576)]) -> (%8694:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=582), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=582), )] (%8692:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=582)]) -> (%8692:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=582)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=582), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=582), )] (%8692:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=582)]) -> (%8695:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=582)]) 
+ linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579), )] (%8693:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579)]) -> (%8693:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579), )] (%8693:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579)]) -> (%8696:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581), )] (%8694:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581)]) -> (%8694:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=581)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581), )] (%8694:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581)]) -> (%8697:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=582), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=583), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=584))] (%8695:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=582)]) -> (%8698:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=583)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=585), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=586))] (%8696:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579)]) -> (%8699:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=585)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=583), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=583), )] (%8698:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=583)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8700:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=583)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=585), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=585), )] (%8699:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=585)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8701:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=585)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=585), outputs_0:QuantSpec(Raw(type: Float16), uuid=587), )] (%8701:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=585)]) -> (%8702:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=587)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=587), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588), )] (%8702:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=587)]) -> (%8703:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588), )] (%8703:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588)]) -> (%8704:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581), outputs_0:QuantSpec(Raw(type: Float16), uuid=589), )] (%8697:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581)]) -> (%8705:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=589)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=589), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=590), )] (%8705:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=589)]) -> (%8706:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=590)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18), )] (%8045:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %8704:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588)]) -> (%8707:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=590), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46), )] (%8046:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)], %8706:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=590)]) -> (%8708:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18), )] (%8707:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)]) -> (%8709:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46), )] (%8708:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) -> (%8710:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=583), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=591), )] (%8700:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=583)], %8709:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)]) -> (%8711:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=591)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=591), inputs_1:QuantSpec(Raw(type: Float32), uuid=592), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=591), )] (%8711:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=591)], %8712:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=592), constant:[0.088388346]]) -> (%8713:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=591)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=591), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593), )] (%8713:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=591)]) -> (%8714:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593), inputs_1:QuantSpec(Raw(type: Int16), uuid=594), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593), )] (%8714:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593)], %8715:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=594), constant:[-20]]) -> (%8716:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=595), 
outputs_0:QuantSpec(Raw(type: UInt8), uuid=596), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8717:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=595), constant:[0.953125]]) -> (%8718:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=596)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=596), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=591), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593), )] (%8718:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=596)], %8713:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=591)], %8716:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593)]) -> (%8719:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597), )] (%8719:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593)]) -> (%8720:tensor<[1, 16, 32, 1024], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=598), )] (%8720:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597)], %8710:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) -> (%8721:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=598)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=598), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=598), )] (%8721:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=598)]) -> (%8722:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=598)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=598), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=598), )] (%8722:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=598)]) -> (%8722:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=598)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=598), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=599))] (%8722:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=598)]) -> (%8723:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600)]) + cf.ReturnOp (%8723:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600)], %8704:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588)], %8706:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=590)]) -> () } } graph.SubGraphOp @model.layers.15.mlp [using_qnn:true, symbol:model.layers.15.mlp] { - (%1030:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600)]) -> (%1035:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=608)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=603), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=602))] (%1030:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600)]) -> (%1031:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=603)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=603), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604), )] (%1031:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=603)]) -> (%1032:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=600), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=606), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=605))] (%1030:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600)]) -> (%1033:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=606)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=606), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604), )] (%1032:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604)], %1033:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=606)]) -> (%1034:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=608), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, 
block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=607))] (%1034:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604)]) -> (%1035:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=608)]) - cf.ReturnOp (%1035:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=608)]) -> () + (%8725:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=601)]) -> (%8730:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=601), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=603))] (%8725:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=601)]) -> (%8726:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604)]) + linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=604), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=605), )] (%8726:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604)]) -> (%8727:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=605)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=601), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=607), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=606))] (%8725:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=601)]) -> (%8728:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=607)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=605), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=607), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=605), )] (%8727:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=605)], %8728:tensor<[1, 
32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=607)]) -> (%8729:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=605)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=605), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=608))] (%8729:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=605)]) -> (%8730:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)]) + cf.ReturnOp (%8730:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)]) -> () } } graph.SubGraphOp @model.layers.16 [using_qnn:true, symbol:model.layers.16] { - (%1036:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=608)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], 
%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %352:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %353:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) -> (%1077:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=642)], %1050:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=621)], %1052:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=623)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=608), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609), )] (%1036:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=608)]) -> (%1037:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)]) - graph.CallGraphOp @model.layers.16.self_attn (%1037:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %352:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %353:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) -> (%1069:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=633)], %1050:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=621)], %1052:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=623)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=633), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=608), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=633), )] (%1069:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=633)], %1036:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=608)]) -> (%1070:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=633)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=633), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634), )] (%1070:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=633)]) -> (%1071:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)]) - graph.CallGraphOp @model.layers.16.mlp (%1071:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)]) -> (%1076:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=642)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=642), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=633), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=642), )] (%1076:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=642)], %1070:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=633)]) 
-> (%1077:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=642)]) - cf.ReturnOp (%1077:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=642)], %1050:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=621)], %1052:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=623)]) -> () + (%8731:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8047:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %8048:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) -> (%8772:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)], %8745:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622)], %8747:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=624)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=610), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=611))] (%8731:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)]) -> (%8732:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=610)]) + graph.CallGraphOp @model.layers.16.self_attn (%8732:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=610)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8047:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %8048:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) -> (%8764:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)], %8745:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622)], %8747:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=624)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634), )] (%8764:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)], %8731:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)]) -> (%8765:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=635), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=636))] 
(%8765:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)]) -> (%8766:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=635)]) + graph.CallGraphOp @model.layers.16.mlp (%8766:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=635)]) -> (%8771:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643), )] (%8771:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)], %8765:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)]) -> (%8772:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)]) + cf.ReturnOp (%8772:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)], %8745:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622)], %8747:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=624)]) -> () } } graph.SubGraphOp @model.layers.16.self_attn [using_qnn:true, symbol:model.layers.16.self_attn] { - (%1037:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %352:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %353:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) -> (%1069:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=633)], %1050:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=621)], %1052:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=623)]) { - linalg.CPU.LinearOp (%1037:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)]) -> (%1038:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=615)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=612), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=611))] (%1037:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)]) -> (%1039:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=612)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=614), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=613))] (%1037:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)]) -> (%1040:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=614)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: 
Int16PerTensor), uuid=615), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=615), )] (%1038:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=615)]) -> (%1038:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=615)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=615), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=615), )] (%1038:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=615)]) -> (%1041:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=615)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=612), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=612), )] (%1039:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=612)]) -> (%1039:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=612)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=612), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=612), )] (%1039:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=612)]) -> (%1042:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=612)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=614), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=614), )] (%1040:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=614)]) -> (%1040:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=614)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=614), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=614), )] (%1040:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=614)]) -> (%1043:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=614)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=615), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=616), )] (%1041:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=615)]) -> (%1044:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=616)]) - linalg.CPU.RMSNormOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=612), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=618), )] (%1042:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=612)]) -> (%1045:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=618)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=616), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=616), )] (%1044:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=616)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1046:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=616)]) - linalg.CPU.RoPEOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=618), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=618), )] (%1045:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=618)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1047:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=618)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=618), outputs_0:QuantSpec(Raw(type: Float16), uuid=620), )] (%1047:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=618)]) -> (%1048:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=620)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=620), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=621), )] 
(%1048:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=620)]) -> (%1049:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=621)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=621), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=621), )] (%1049:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=621)]) -> (%1050:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=621)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=614), outputs_0:QuantSpec(Raw(type: Float16), uuid=622), )] (%1043:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=614)]) -> (%1051:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=622)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=622), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=623), )] (%1051:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=622)]) -> (%1052:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=623)]) - linalg.CPU.ConcatOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=621), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19), )] (%352:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %1050:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=621)]) -> (%1053:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=623), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47), )] (%353:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)], %1052:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=623)]) -> (%1054:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=19), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19), )] (%1053:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)]) -> (%1055:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47), )] (%1054:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) -> (%1056:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=616), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=624), )] (%1046:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=616)], %1055:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)]) -> (%1057:tensor<[1, 16, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=624)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=624), inputs_1:QuantSpec(Raw(type: Float32), uuid=625), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=624), )] (%1057:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=624)], %1058:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=625), constant:[0.088388346]]) -> (%1059:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=624)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=624), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=626), )] (%1059:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=624)]) -> (%1060:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=626)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=626), inputs_1:QuantSpec(Raw(type: Int16), uuid=627), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=626), )] 
(%1060:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=626)], %1061:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=627), constant:[-20]]) -> (%1062:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=626)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=628), outputs_0:QuantSpec(Raw(type: UInt8), uuid=629), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %1063:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=628), constant:[0.118652344]]) -> (%1064:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=629)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=629), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=624), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=626), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=626), )] (%1064:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=629)], %1059:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=624)], %1062:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=626)]) -> (%1065:tensor<[1, 16, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=626)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=626), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=630), )] (%1065:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=626)]) -> (%1066:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=630)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=630), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=631), )] (%1066:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=630)], %1056:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) -> (%1067:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=631)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=631), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=631), )] (%1067:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=631)]) -> (%1068:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=631)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=631), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=631), )] (%1068:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=631)]) -> (%1068:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=631)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=631), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=633), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=632))] (%1068:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=631)]) -> (%1069:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=633)]) - cf.ReturnOp (%1069:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=633)], %1050:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=621)], %1052:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=623)]) -> () + (%8732:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=610)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8047:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %8048:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) -> (%8764:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)], %8745:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622)], %8747:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=624)]) { + linalg.CPU.LinearOp (%8732:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=610)]) -> (%8733:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=616)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=610), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=613), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=612))] (%8732:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=610)]) -> (%8734:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=613)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=610), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=615), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=614))] (%8732:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=610)]) -> (%8735:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=615)]) + 
linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=616), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=616), )] (%8733:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=616)]) -> (%8733:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=616)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=616), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=616), )] (%8733:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=616)]) -> (%8736:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=616)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=613), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=613), )] (%8734:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=613)]) -> (%8734:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=613)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=613), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=613), )] (%8734:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=613)]) -> (%8737:tensor<[1, 8, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=613)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=615), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=615), )] (%8735:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=615)]) -> (%8735:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=615)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=615), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=615), )] (%8735:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=615)]) -> (%8738:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=615)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=616), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=617), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=618))] (%8736:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=616)]) -> (%8739:tensor<[1, 16, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=617)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=613), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=620))] (%8737:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=613)]) -> (%8740:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=617), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=617), )] (%8739:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=617)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=65)]) -> (%8741:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=617)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619), )] (%8740:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8742:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619), outputs_0:QuantSpec(Raw(type: Float16), uuid=621), )] (%8742:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619)]) -> (%8743:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=621)]) + 
linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=621), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622), )] (%8743:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=621)]) -> (%8744:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622), )] (%8744:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622)]) -> (%8745:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=615), outputs_0:QuantSpec(Raw(type: Float16), uuid=623), )] (%8738:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=615)]) -> (%8746:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=623)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=623), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=624), )] (%8746:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=623)]) -> 
(%8747:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=624)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19), )] (%8047:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %8745:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622)]) -> (%8748:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=624), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47), )] (%8048:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)], %8747:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=624)]) -> (%8749:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=47)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19), )] (%8748:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)]) -> (%8750:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47), )] (%8749:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) -> (%8751:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=617), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=625), )] (%8741:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=617)], %8750:tensor<[1, 16, 128, 1024], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)]) -> (%8752:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=625)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=625), inputs_1:QuantSpec(Raw(type: Float32), uuid=626), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=625), )] (%8752:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=625)], %8753:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=626), constant:[0.088388346]]) -> (%8754:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=625)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=625), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627), )] (%8754:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=625)]) -> (%8755:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=627), inputs_1:QuantSpec(Raw(type: Int16), uuid=628), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627), )] (%8755:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627)], %8756:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=628), constant:[-20]]) -> (%8757:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=629), outputs_0:QuantSpec(Raw(type: UInt8), uuid=630), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8758:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=629), constant:[0.118652344]]) -> (%8759:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=630)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=630), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=625), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627), )] (%8759:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=630)], %8754:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=625)], %8757:tensor<[1, 16, 32, 1], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627)]) -> (%8760:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=631), )] (%8760:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627)]) -> (%8761:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=631)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=631), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=632), )] (%8761:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=631)], %8751:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) -> (%8762:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=632)]) + linalg.CPU.TransposeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=632), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=632), )] (%8762:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=632)]) -> (%8763:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=632)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=632), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=632), )] (%8763:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=632)]) -> (%8763:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=632)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=632), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=633))] (%8763:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=632)]) -> (%8764:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)]) + cf.ReturnOp (%8764:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)], %8745:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622)], %8747:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=624)]) -> () } } graph.SubGraphOp @model.layers.16.mlp [using_qnn:true, symbol:model.layers.16.mlp] { - (%1071:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)]) -> (%1076:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=642)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=637), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=636))] (%1071:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)]) -> (%1072:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=637)]) - 
linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=637), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=638), )] (%1072:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=637)]) -> (%1073:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=638)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=640), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=639))] (%1071:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)]) -> (%1074:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=640)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=638), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=640), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=638), )] (%1073:tensor<[1, 32, 6144], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=638)], %1074:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=640)]) -> (%1075:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=638)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=638), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=642), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=641))] (%1075:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=638)]) -> (%1076:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=642)]) - cf.ReturnOp (%1076:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=642)]) -> () + (%8766:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=635)]) -> (%8771:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=635), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=638), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=637))] (%8766:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=635)]) -> (%8767:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=638)]) + linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=638), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639), )] (%8767:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=638)]) -> (%8768:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=635), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=641), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=640))] (%8766:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=635)]) -> (%8769:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=641)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=641), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639), )] (%8768:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639)], %8769:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=641)]) -> (%8770:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=642))] (%8770:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639)]) -> (%8771:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=643)]) + cf.ReturnOp (%8771:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)]) -> () } } graph.SubGraphOp @model.layers.17 [using_qnn:true, symbol:model.layers.17] { - (%1077:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=642)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %354:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %355:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) -> (%1118:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=676)], %1091:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=655)], %1093:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=657)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=642), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643), )] (%1077:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=642)]) -> (%1078:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)]) - graph.CallGraphOp @model.layers.17.self_attn (%1078:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %354:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %355:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) -> (%1110:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=667)], %1091:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=655)], %1093:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=657)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=667), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=642), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=667), )] (%1110:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=667)], %1077:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=642)]) -> (%1111:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=667)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=667), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668), )] (%1111:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=667)]) -> (%1112:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668)]) - graph.CallGraphOp @model.layers.17.mlp (%1112:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668)]) -> (%1117:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=676)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=676), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=667), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=676), )] (%1117:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=676)], %1111:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=667)]) -> (%1118:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=676)]) - cf.ReturnOp (%1118:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=676)], %1091:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=655)], %1093:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=657)]) -> () + (%8772:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8049:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %8050:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) -> (%8813:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)], %8786:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656)], %8788:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=658)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=644), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=645))] (%8772:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)]) -> (%8773:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=644)]) + graph.CallGraphOp 
@model.layers.17.self_attn (%8773:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=644)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8049:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %8050:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) -> (%8805:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668)], %8786:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656)], %8788:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=658)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668), )] (%8805:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668)], %8772:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)]) -> (%8806:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=670))] (%8806:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668)]) -> (%8807:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669)]) + graph.CallGraphOp @model.layers.17.mlp (%8807:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669)]) -> (%8812:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=668), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677), )] (%8812:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)], %8806:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668)]) -> (%8813:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)]) + cf.ReturnOp (%8813:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)], %8786:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656)], %8788:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=658)]) -> () } } graph.SubGraphOp @model.layers.17.self_attn [using_qnn:true, symbol:model.layers.17.self_attn] { - (%1078:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %354:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %355:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) -> (%1110:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=667)], %1091:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=655)], %1093:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=657)]) { - linalg.CPU.LinearOp (%1078:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)]) -> (%1079:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=649)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=646), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=645))] (%1078:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)]) -> (%1080:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=646)]) - 
linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=648), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=647))] (%1078:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)]) -> (%1081:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=648)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=649), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=649), )] (%1079:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=649)]) -> (%1079:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=649)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=649), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=649), )] (%1079:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=649)]) -> (%1082:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=649)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=646), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=646), )] (%1080:tensor<[1, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=646)]) -> (%1080:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=646)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=646), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=646), )] (%1080:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=646)]) -> (%1083:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=646)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=648), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=648), )] (%1081:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=648)]) -> (%1081:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=648)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=648), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=648), )] (%1081:tensor<[1, 32, 8, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=648)]) -> (%1084:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=648)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=649), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=650), )] (%1082:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=649)]) -> (%1085:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=650)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=646), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=652), )] (%1083:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=646)]) -> (%1086:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=652)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=650), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=650), )] (%1085:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=650)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1087:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=650)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=652), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=652), )] (%1086:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=652)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1088:tensor<[1, 8, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=652)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=652), outputs_0:QuantSpec(Raw(type: Float16), uuid=654), )] (%1088:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=652)]) -> (%1089:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=654)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=654), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=655), )] (%1089:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=654)]) -> (%1090:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=655)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=655), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=655), )] (%1090:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=655)]) -> (%1091:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=655)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=648), 
outputs_0:QuantSpec(Raw(type: Float16), uuid=656), )] (%1084:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=648)]) -> (%1092:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=656)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=656), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=657), )] (%1092:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=656)]) -> (%1093:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=657)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=655), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20), )] (%354:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %1091:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=655)]) -> (%1094:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=657), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48), )] (%355:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)], %1093:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=657)]) -> (%1095:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20), )] (%1094:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)]) -> (%1096:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48), )] (%1095:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) -> (%1097:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=48)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=650), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=658), )] (%1087:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=650)], %1096:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)]) -> (%1098:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=658)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=658), inputs_1:QuantSpec(Raw(type: Float32), uuid=659), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=658), )] (%1098:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=658)], %1099:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=659), constant:[0.088388346]]) -> (%1100:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=658)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=658), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=660), )] (%1100:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=658)]) -> (%1101:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=660)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=660), inputs_1:QuantSpec(Raw(type: Int16), uuid=661), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=660), )] (%1101:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=660)], %1102:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=661), constant:[-20]]) -> (%1103:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=660)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=662), outputs_0:QuantSpec(Raw(type: UInt8), uuid=663), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %1104:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=662), constant:[-0.99609375]]) -> (%1105:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=663)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=663), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=658), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=660), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=660), )] (%1105:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=663)], %1100:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=658)], %1103:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=660)]) -> (%1106:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=660)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=660), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=664), )] (%1106:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=660)]) -> (%1107:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=664)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=664), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=665), )] (%1107:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=664)], %1097:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) -> (%1108:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665), )] (%1108:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665)]) -> (%1109:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665), )] (%1109:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665)]) -> (%1109:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665)]) - linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=667), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=666))] (%1109:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665)]) -> (%1110:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=667)]) - cf.ReturnOp (%1110:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=667)], %1091:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=655)], %1093:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=657)]) -> () + (%8773:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=644)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], 
%8049:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %8050:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) -> (%8805:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668)], %8786:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656)], %8788:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=658)]) { + linalg.CPU.LinearOp (%8773:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=644)]) -> (%8774:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=650)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=644), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=646))] (%8773:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=644)]) -> (%8775:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=647)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=644), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=648))] (%8773:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=644)]) -> (%8776:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=650), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=650), )] (%8774:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=650)]) -> (%8774:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=650)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=650), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=650), )] (%8774:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=650)]) -> (%8777:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=650)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647), )] (%8775:tensor<[1, 32, 
1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647)]) -> (%8775:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647), )] (%8775:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647)]) -> (%8778:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649), )] (%8776:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649)]) -> (%8776:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649), )] 
(%8776:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649)]) -> (%8779:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=650), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=651), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=652))] (%8777:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=650)]) -> (%8780:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=651)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=653), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=654))] (%8778:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647)]) -> (%8781:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=653)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=651), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=651), )] (%8780:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=651)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8782:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=651)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=653), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=653), )] (%8781:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=653)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8783:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=653)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=653), outputs_0:QuantSpec(Raw(type: Float16), uuid=655), )] (%8783:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=653)]) -> (%8784:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=655)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=655), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656), )] (%8784:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=655)]) -> (%8785:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656), )] (%8785:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656)]) -> (%8786:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656)]) + 
linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649), outputs_0:QuantSpec(Raw(type: Float16), uuid=657), )] (%8779:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649)]) -> (%8787:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=657)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=657), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=658), )] (%8787:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=657)]) -> (%8788:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=658)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20), )] (%8049:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %8786:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656)]) -> (%8789:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)]) + linalg.CPU.ConcatOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=658), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48), )] (%8050:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)], %8788:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=658)]) -> (%8790:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20), )] (%8789:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)]) -> (%8791:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48), )] (%8790:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=48)]) -> (%8792:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=651), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=659), )] (%8782:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=651)], %8791:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)]) -> (%8793:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=659)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=659), inputs_1:QuantSpec(Raw(type: Float32), uuid=660), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=659), )] (%8793:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=659)], %8794:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=660), constant:[0.088388346]]) -> (%8795:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=659)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=659), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=661), )] (%8795:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=659)]) -> (%8796:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=661)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=661), inputs_1:QuantSpec(Raw(type: Int16), uuid=662), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=661), )] (%8796:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=661)], %8797:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=662), constant:[-20]]) -> (%8798:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=661)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=663), outputs_0:QuantSpec(Raw(type: UInt8), uuid=664), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8799:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=663), constant:[-0.99609375]]) -> (%8800:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), 
uuid=664)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=664), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=659), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=661), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=661), )] (%8800:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=664)], %8795:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=659)], %8798:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=661)]) -> (%8801:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=661)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=661), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665), )] (%8801:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=661)]) -> (%8802:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=666), )] (%8802:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665)], %8792:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) -> (%8803:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=666)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=666), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=666), )] (%8803:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=666)]) -> (%8804:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=666)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=666), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=666), )] (%8804:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=666)]) -> (%8804:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=666)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=666), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=667))] (%8804:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=666)]) -> (%8805:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668)]) + cf.ReturnOp (%8805:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668)], %8786:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656)], %8788:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=658)]) -> () } } graph.SubGraphOp @model.layers.17.mlp [using_qnn:true, symbol:model.layers.17.mlp] { - (%1112:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668)]) -> (%1117:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=676)]) { - linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=671), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=670))] (%1112:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668)]) -> (%1113:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=671)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=671), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=672), )] (%1113:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=671)]) -> (%1114:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=672)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=674), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=673))] (%1112:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668)]) -> (%1115:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=674)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=672), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=674), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=672), )] (%1114:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=672)], %1115:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=674)]) -> (%1116:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=672)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=672), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=676), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=675))] (%1116:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=672)]) -> (%1117:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=676)]) - cf.ReturnOp (%1117:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=676)]) -> () + (%8807:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669)]) -> (%8812:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=672), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=671))] (%8807:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669)]) -> (%8808:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=672)]) + linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=672), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=673), )] (%8808:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=672)]) -> (%8809:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=673)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=675), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=674))] (%8807:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669)]) -> (%8810:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=675)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=673), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=675), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=673), )] (%8809:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=673)], %8810:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=675)]) -> (%8811:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=673)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=673), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=676))] (%8811:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=673)]) -> (%8812:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)]) + cf.ReturnOp (%8812:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)]) -> () } } graph.SubGraphOp @model.layers.18 [using_qnn:true, symbol:model.layers.18] { - (%1118:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=676)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %356:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %357:tensor<[1, 8, 992, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) -> (%1159:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=710)], %1132:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=689)], %1134:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=691)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=676), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677), )] (%1118:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=676)]) -> (%1119:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)]) - graph.CallGraphOp @model.layers.18.self_attn (%1119:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], 
%356:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %357:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) -> (%1151:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=701)], %1132:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=689)], %1134:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=691)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=701), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=676), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=701), )] (%1151:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=701)], %1118:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=676)]) -> (%1152:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=701)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=701), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702), )] (%1152:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=701)]) -> (%1153:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702)]) - graph.CallGraphOp @model.layers.18.mlp (%1153:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702)]) -> (%1158:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=710)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=710), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=701), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=710), )] (%1158:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=710)], %1152:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=701)]) -> (%1159:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=710)]) - cf.ReturnOp (%1159:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=710)], %1132:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=689)], %1134:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=691)]) -> () + (%8813:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8051:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %8052:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) -> (%8854:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)], %8827:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690)], %8829:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=692)]) { + linalg.CPU.RMSNormOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=678), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=679))] (%8813:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)]) -> (%8814:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=678)]) + graph.CallGraphOp @model.layers.18.self_attn (%8814:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=678)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8051:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %8052:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) -> (%8846:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702)], %8827:tensor<[1, 8, 128, 
32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690)], %8829:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=692)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702), )] (%8846:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702)], %8813:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)]) -> (%8847:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=703), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=704))] (%8847:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702)]) -> (%8848:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=703)]) + graph.CallGraphOp @model.layers.18.mlp (%8848:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=703)]) -> (%8853:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711), )] (%8853:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)], %8847:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702)]) -> (%8854:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)]) + cf.ReturnOp (%8854:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)], %8827:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690)], %8829:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=692)]) -> () } } 
graph.SubGraphOp @model.layers.18.self_attn [using_qnn:true, symbol:model.layers.18.self_attn] { - (%1119:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %356:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %357:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) -> (%1151:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=701)], %1132:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=689)], %1134:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=691)]) { - linalg.CPU.LinearOp (%1119:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)]) -> (%1120:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=683)]) - linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=680), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=679))] (%1119:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)]) -> (%1121:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=680)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=682), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=681))] (%1119:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)]) -> (%1122:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=682)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=683), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=683), )] (%1120:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=683)]) -> (%1120:tensor<[1, 32, 16, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=683)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=683), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=683), )] (%1120:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=683)]) -> (%1123:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=683)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=680), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=680), )] (%1121:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=680)]) -> (%1121:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=680)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=680), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=680), )] (%1121:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=680)]) -> (%1124:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=680)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=682), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=682), )] (%1122:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=682)]) -> (%1122:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=682)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=682), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=682), )] (%1122:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=682)]) -> (%1125:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=682)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=683), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=684), )] (%1123:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=683)]) -> (%1126:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=684)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=680), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=686), )] (%1124:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=680)]) -> (%1127:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=686)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=684), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=684), )] (%1126:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=684)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1128:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=684)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=686), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=686), )] (%1127:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=686)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1129:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=686)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=686), outputs_0:QuantSpec(Raw(type: Float16), uuid=688), )] (%1129:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=686)]) -> (%1130:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=688)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=688), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=689), )] (%1130:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=688)]) -> (%1131:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=689)]) - linalg.CPU.TransposeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=689), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=689), )] (%1131:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=689)]) -> (%1132:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=689)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=682), outputs_0:QuantSpec(Raw(type: Float16), uuid=690), )] (%1125:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=682)]) -> (%1133:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=690)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=690), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=691), )] (%1133:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=690)]) -> (%1134:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=691)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=689), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 
127, quant_to_type: Int8, scale_type: Float32), uuid=21), )] (%356:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %1132:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=689)]) -> (%1135:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=691), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49), )] (%357:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)], %1134:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=691)]) -> (%1136:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21), )] (%1135:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)]) -> 
(%1137:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49), )] (%1136:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) -> (%1138:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=684), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=692), )] (%1128:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=684)], %1137:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)]) -> (%1139:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=692)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=692), inputs_1:QuantSpec(Raw(type: 
Float32), uuid=693), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=692), )] (%1139:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=692)], %1140:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=693), constant:[0.088388346]]) -> (%1141:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=692)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=692), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694), )] (%1141:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=692)]) -> (%1142:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694), inputs_1:QuantSpec(Raw(type: Int16), uuid=695), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694), )] (%1142:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694)], %1143:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=695), constant:[-20]]) -> (%1144:tensor<[1, 16, 32, 1], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=696), outputs_0:QuantSpec(Raw(type: UInt8), uuid=697), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %1145:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=696), constant:[0.24023438]]) -> (%1146:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=697)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=697), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=692), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694), )] (%1146:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=697)], %1141:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=692)], %1144:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694)]) -> (%1147:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=698), )] (%1147:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694)]) -> (%1148:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=698)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=698), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699), )] (%1148:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=698)], %1138:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) -> (%1149:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699), )] (%1149:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699)]) -> (%1150:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699), )] (%1150:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699)]) -> (%1150:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=701), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=700))] (%1150:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699)]) -> (%1151:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=701)]) - cf.ReturnOp (%1151:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=701)], %1132:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=689)], %1134:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=691)]) -> () + (%8814:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=678)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8051:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %8052:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) -> (%8846:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702)], %8827:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690)], %8829:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=692)]) { + linalg.CPU.LinearOp (%8814:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=678)]) -> (%8815:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=684)]) + linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=678), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=681), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=680))] (%8814:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=678)]) -> (%8816:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=681)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=678), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=682))] (%8814:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=678)]) -> (%8817:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=684), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=684), )] (%8815:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=684)]) -> (%8815:tensor<[1, 32, 16, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=684)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=684), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=684), )] (%8815:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=684)]) -> (%8818:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=684)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=681), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=681), )] (%8816:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=681)]) -> (%8816:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=681)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=681), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=681), )] (%8816:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=681)]) -> (%8819:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=681)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683), )] (%8817:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683)]) -> (%8817:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683), )] (%8817:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683)]) -> (%8820:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=684), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=685), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=686))] (%8818:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=684)]) -> (%8821:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=685)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=681), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=687), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=688))] (%8819:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=681)]) -> (%8822:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=687)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=685), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=685), )] (%8821:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=685)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8823:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=685)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=687), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=687), )] (%8822:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=687)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8824:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=687)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=687), outputs_0:QuantSpec(Raw(type: Float16), uuid=689), )] (%8824:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=687)]) -> (%8825:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=689)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=689), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690), )] (%8825:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=689)]) -> (%8826:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690), )] (%8826:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690)]) -> (%8827:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683), outputs_0:QuantSpec(Raw(type: Float16), uuid=691), )] (%8820:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683)]) -> (%8828:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=691)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=691), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=692), )] (%8828:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=691)]) -> (%8829:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=692)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21), )] (%8051:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %8827:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690)]) -> (%8830:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=692), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49), )] (%8052:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)], %8829:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=692)]) -> (%8831:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21), )] 
(%8830:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)]) -> (%8832:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49), )] (%8831:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) -> (%8833:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=685), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=693), )] (%8823:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=685)], %8832:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)]) -> (%8834:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=693)]) + linalg.CPU.MulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=693), inputs_1:QuantSpec(Raw(type: Float32), uuid=694), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=693), )] (%8834:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=693)], %8835:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=694), constant:[0.088388346]]) -> (%8836:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=693)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=693), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695), )] (%8836:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=693)]) -> (%8837:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695), inputs_1:QuantSpec(Raw(type: Int16), uuid=696), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695), )] (%8837:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=695)], %8838:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=696), constant:[-20]]) -> (%8839:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=697), outputs_0:QuantSpec(Raw(type: UInt8), uuid=698), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8840:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=697), constant:[0.24023438]]) -> (%8841:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=698)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=698), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=693), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695), )] (%8841:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=698)], %8836:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=693)], %8839:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695)]) -> (%8842:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695)]) + linalg.CPU.SoftmaxOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699), )] (%8842:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695)]) -> (%8843:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=700), )] (%8843:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699)], %8833:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) -> (%8844:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=700)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=700), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=700), )] (%8844:tensor<[1, 16, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=700)]) -> (%8845:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=700)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=700), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=700), )] (%8845:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=700)]) -> (%8845:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=700)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=700), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=701))] (%8845:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=700)]) -> (%8846:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702)]) + cf.ReturnOp (%8846:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=702)], %8827:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690)], %8829:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=692)]) -> () } } graph.SubGraphOp @model.layers.18.mlp [using_qnn:true, symbol:model.layers.18.mlp] { - (%1153:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702)]) -> (%1158:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=710)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=705), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=704))] (%1153:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702)]) -> (%1154:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=705)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=705), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=706), )] (%1154:tensor<[1, 32, 6144], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=705)]) -> (%1155:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=706)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=708), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=707))] (%1153:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702)]) -> (%1156:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=708)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=706), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=708), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=706), )] (%1155:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=706)], %1156:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=708)]) -> (%1157:tensor<[1, 32, 6144], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=706)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=706), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=710), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=709))] (%1157:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=706)]) -> (%1158:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=710)]) - cf.ReturnOp (%1158:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=710)]) -> () + (%8848:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=703)]) -> (%8853:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=703), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=706), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), 
uuid=705))] (%8848:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=703)]) -> (%8849:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=706)]) + linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=706), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=707), )] (%8849:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=706)]) -> (%8850:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=707)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=703), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=709), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=708))] (%8848:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=703)]) -> (%8851:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=709)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=707), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=709), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=707), )] (%8850:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=707)], %8851:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=709)]) -> (%8852:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=707)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=707), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=710))] (%8852:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=707)]) -> (%8853:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)]) + cf.ReturnOp (%8853:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)]) -> () } } graph.SubGraphOp @model.layers.19 [using_qnn:true, symbol:model.layers.19] { - (%1159:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=710)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %358:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %359:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) -> (%1200:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744)], %1173:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=723)], %1175:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=725)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=710), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711), )] (%1159:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=710)]) -> (%1160:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)]) - graph.CallGraphOp @model.layers.19.self_attn (%1160:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %358:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %359:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) -> (%1192:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=735)], %1173:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=723)], %1175:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=725)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=735), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=710), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=735), )] (%1192:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=735)], %1159:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=710)]) -> (%1193:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=735)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=735), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736), )] (%1193:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=735)]) -> (%1194:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736)]) - graph.CallGraphOp @model.layers.19.mlp (%1194:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736)]) -> (%1199:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=735), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744), )] (%1199:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744)], %1193:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=735)]) -> (%1200:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744)]) - cf.ReturnOp (%1200:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744)], %1173:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=723)], %1175:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=725)]) -> () + (%8854:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8053:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=22)], %8054:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) -> (%8895:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)], %8868:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724)], %8870:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=726)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=712), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=713))] (%8854:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)]) -> (%8855:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=712)]) + graph.CallGraphOp @model.layers.19.self_attn (%8855:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=712)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8053:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %8054:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) -> (%8887:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736)], %8868:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724)], %8870:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=726)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736), )] (%8887:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736)], %8854:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)]) -> (%8888:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=737), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=738))] (%8888:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736)]) -> (%8889:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=737)]) + graph.CallGraphOp @model.layers.19.mlp (%8889:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=737)]) -> (%8894:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745), )] (%8894:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)], %8888:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=736)]) -> (%8895:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)]) + cf.ReturnOp (%8895:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)], %8868:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724)], %8870:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=726)]) -> () } } graph.SubGraphOp @model.layers.19.self_attn [using_qnn:true, symbol:model.layers.19.self_attn] { - (%1160:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %358:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %359:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) -> (%1192:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=735)], %1173:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=723)], %1175:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=725)]) { - linalg.CPU.LinearOp (%1160:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)]) -> (%1161:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=717)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=714), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=713))] (%1160:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)]) -> (%1162:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=714)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=716), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: 
Float32), uuid=715))] (%1160:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)]) -> (%1163:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=716)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=717), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=717), )] (%1161:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=717)]) -> (%1161:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=717)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=717), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=717), )] (%1161:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=717)]) -> (%1164:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=717)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=714), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=714), )] (%1162:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=714)]) -> (%1162:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=714)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=714), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=714), )] (%1162:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=714)]) -> (%1165:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=714)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=716), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=716), )] (%1163:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=716)]) -> (%1163:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=716)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=716), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=716), )] (%1163:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=716)]) -> (%1166:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=716)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=717), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=718), )] (%1164:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=717)]) -> (%1167:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=718)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=714), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=720), )] (%1165:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=714)]) -> (%1168:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=720)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=718), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=718), )] (%1167:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=718)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1169:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=718)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=720), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=720), )] (%1168:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=720)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1170:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=720)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=720), outputs_0:QuantSpec(Raw(type: Float16), uuid=722), )] (%1170:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=720)]) -> (%1171:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=722)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=722), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=723), )] (%1171:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=722)]) -> (%1172:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=723)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=723), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=723), )] (%1172:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=723)]) -> (%1173:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=723)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=716), outputs_0:QuantSpec(Raw(type: Float16), uuid=724), )] (%1166:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=716)]) -> (%1174:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=724)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=724), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=725), )] (%1174:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=724)]) -> (%1175:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=725)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=723), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22), )] (%358:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %1173:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=723)]) -> (%1176:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=725), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50), )] (%359:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)], %1175:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=725)]) -> (%1177:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22), )] (%1176:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)]) -> (%1178:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50), )] (%1177:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) -> (%1179:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=718), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=726), )] (%1169:tensor<[1, 16, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=718)], %1178:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)]) -> (%1180:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=726)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=726), inputs_1:QuantSpec(Raw(type: Float32), uuid=727), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=726), )] (%1180:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=726)], %1181:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=727), constant:[0.088388346]]) -> (%1182:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=726)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=726), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=728), )] (%1182:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=726)]) -> (%1183:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=728)]) 
- linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=728), inputs_1:QuantSpec(Raw(type: Int16), uuid=729), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=728), )] (%1183:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=728)], %1184:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=729), constant:[-20]]) -> (%1185:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=728)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=730), outputs_0:QuantSpec(Raw(type: UInt8), uuid=731), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %1186:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=730), constant:[0.55078125]]) -> (%1187:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=731)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=731), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=726), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=728), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=728), )] (%1187:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=731)], %1182:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=726)], %1185:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=728)]) -> (%1188:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=728)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=728), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=732), )] (%1188:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=728)]) -> (%1189:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=732)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=732), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=733), )] (%1189:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=732)], %1179:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) -> (%1190:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=733)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=733), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=733), )] (%1190:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=733)]) -> (%1191:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=733)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=733), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=733), )] (%1191:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=733)]) -> (%1191:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=733)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=733), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=735), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=734))] (%1191:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=733)]) -> (%1192:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=735)]) - cf.ReturnOp (%1192:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=735)], %1173:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=723)], %1175:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=725)]) -> () + (%8855:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=712)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8053:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %8054:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) -> (%8887:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736)], %8868:tensor<[1, 8, 128, 32], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724)], %8870:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=726)]) { + linalg.CPU.LinearOp (%8855:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=712)]) -> (%8856:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=718)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=712), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=715), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=714))] (%8855:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=712)]) -> (%8857:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=715)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=712), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=716))] (%8855:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=712)]) -> (%8858:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=718), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=718), )] (%8856:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=718)]) -> (%8856:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=718)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=718), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=718), )] (%8856:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=718)]) -> (%8859:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=718)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=715), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=715), )] (%8857:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=715)]) -> (%8857:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=715)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=715), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=715), )] (%8857:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=715)]) -> (%8860:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=715)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717), )] (%8858:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717)]) -> (%8858:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717), )] (%8858:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717)]) -> (%8861:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=718), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=719), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=720))] (%8859:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=718)]) -> (%8862:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=719)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=715), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=721), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=722))] (%8860:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=715)]) -> (%8863:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=721)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=719), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=719), )] (%8862:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=719)], %8074:tensor<[1, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8864:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=719)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=721), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=721), )] (%8863:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=721)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8865:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=721)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=721), outputs_0:QuantSpec(Raw(type: Float16), 
uuid=723), )] (%8865:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=721)]) -> (%8866:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=723)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=723), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724), )] (%8866:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=723)]) -> (%8867:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724), )] (%8867:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724)]) -> (%8868:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717), outputs_0:QuantSpec(Raw(type: Float16), uuid=725), )] (%8861:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717)]) -> (%8869:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=725)]) + linalg.CPU.CastTypeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=725), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=726), )] (%8869:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=725)]) -> (%8870:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=726)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22), )] (%8053:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %8868:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724)]) -> (%8871:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=726), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50), )] (%8054:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=50)], %8870:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=726)]) -> (%8872:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22), )] (%8871:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)]) -> (%8873:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50), )] (%8872:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) -> (%8874:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=719), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727), )] (%8864:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=719)], %8873:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)]) -> (%8875:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727), inputs_1:QuantSpec(Raw(type: Float32), uuid=728), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727), )] (%8875:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727)], %8876:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=728), constant:[0.088388346]]) -> (%8877:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729), )] (%8877:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727)]) -> (%8878:tensor<[1, 16, 32, 1], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729), inputs_1:QuantSpec(Raw(type: Int16), uuid=730), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729), )] (%8878:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729)], %8879:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=730), constant:[-20]]) -> (%8880:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=731), outputs_0:QuantSpec(Raw(type: UInt8), uuid=732), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8881:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=731), constant:[0.55078125]]) -> (%8882:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=732)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=732), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729), )] (%8882:tensor<[1, 1, 32, 1024], UInt8, 
CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=732)], %8877:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727)], %8880:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729)]) -> (%8883:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=733), )] (%8883:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729)]) -> (%8884:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=733)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=733), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=734), )] (%8884:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=733)], %8874:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) -> (%8885:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=734)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=734), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=734), )] (%8885:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=734)]) -> (%8886:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=734)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=734), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=734), )] (%8886:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=734)]) -> (%8886:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=734)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=734), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=735))] (%8886:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=734)]) -> (%8887:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736)]) + cf.ReturnOp (%8887:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736)], %8868:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724)], %8870:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=726)]) -> () } } graph.SubGraphOp @model.layers.19.mlp [using_qnn:true, symbol:model.layers.19.mlp] { - (%1194:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736)]) -> (%1199:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=739), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=738))] (%1194:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736)]) -> (%1195:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=739)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=739), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=740), )] (%1195:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=739)]) -> (%1196:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=740)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=742), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=741))] (%1194:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736)]) -> (%1197:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=742)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=740), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=742), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=740), )] (%1196:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=740)], %1197:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=742)]) -> (%1198:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=740)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=740), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=743))] (%1198:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=740)]) -> (%1199:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744)]) - cf.ReturnOp (%1199:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744)]) -> () + (%8889:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=737)]) -> (%8894:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=737), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=740), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=739))] (%8889:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=737)]) -> (%8890:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=740)]) + linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=740), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=741), )] (%8890:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=740)]) -> (%8891:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=741)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=737), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=743), 
weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=742))] (%8889:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=737)]) -> (%8892:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=743)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=741), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=743), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=741), )] (%8891:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=741)], %8892:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=743)]) -> (%8893:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=741)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=741), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=744))] (%8893:tensor<[1, 32, 6144], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=741)]) -> (%8894:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)]) + cf.ReturnOp (%8894:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)]) -> () } } graph.SubGraphOp @model.layers.20 [using_qnn:true, symbol:model.layers.20] { - (%1200:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %360:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %361:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) -> (%1241:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=778)], %1214:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=757)], %1216:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=759)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745), )] (%1200:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744)]) -> (%1201:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)]) - graph.CallGraphOp @model.layers.20.self_attn (%1201:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %360:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %361:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) -> (%1233:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=769)], 
%1214:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=757)], %1216:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=759)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=769), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=769), )] (%1233:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=769)], %1200:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744)]) -> (%1234:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=769)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=769), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770), )] (%1234:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=769)]) -> (%1235:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770)]) - graph.CallGraphOp 
@model.layers.20.mlp (%1235:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770)]) -> (%1240:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=778)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=778), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=769), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=778), )] (%1240:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=778)], %1234:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=769)]) -> (%1241:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=778)]) - cf.ReturnOp (%1241:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=778)], %1214:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=757)], %1216:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=759)]) -> () + (%8895:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8055:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %8056:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) -> (%8936:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)], %8909:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758)], %8911:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=760)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=747))] (%8895:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=745)]) -> (%8896:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746)]) + graph.CallGraphOp @model.layers.20.self_attn (%8896:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8055:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %8056:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) -> (%8928:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770)], %8909:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758)], %8911:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=760)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=745), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770), )] (%8928:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770)], %8895:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)]) -> (%8929:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=771), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=772))] (%8929:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770)]) -> (%8930:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=771)]) + graph.CallGraphOp @model.layers.20.mlp (%8930:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=771)]) -> (%8935:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)]) + linalg.CPU.AddOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779), )] (%8935:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)], %8929:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770)]) -> (%8936:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)]) + cf.ReturnOp (%8936:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)], %8909:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758)], %8911:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=760)]) -> () } } graph.SubGraphOp @model.layers.20.self_attn [using_qnn:true, symbol:model.layers.20.self_attn] { - (%1201:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %360:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %361:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) -> (%1233:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=769)], %1214:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=757)], %1216:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=759)]) { - linalg.CPU.LinearOp (%1201:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)]) -> (%1202:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=751)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=748), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=747))] (%1201:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)]) -> (%1203:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=748)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=750), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=749))] (%1201:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)]) -> (%1204:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=750)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=751), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=751), )] (%1202:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=751)]) -> (%1202:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=751)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=751), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=751), )] (%1202:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=751)]) -> (%1205:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=751)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=748), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=748), )] (%1203:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=748)]) -> (%1203:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=748)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=748), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=748), )] (%1203:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=748)]) -> (%1206:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=748)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=750), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=750), )] (%1204:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=750)]) -> (%1204:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=750)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=750), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=750), )] (%1204:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=750)]) -> (%1207:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=750)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=751), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=752), )] (%1205:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=751)]) -> (%1208:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=752)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=748), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=754), )] (%1206:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=748)]) -> (%1209:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=754)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=752), inputs_1:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=752), )] (%1208:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=752)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1210:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=752)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=754), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=754), )] (%1209:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=754)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1211:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=754)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=754), outputs_0:QuantSpec(Raw(type: Float16), uuid=756), )] (%1211:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=754)]) -> (%1212:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=756)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=756), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=757), )] (%1212:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=756)]) -> (%1213:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=757)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=757), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=757), )] (%1213:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=757)]) -> (%1214:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=757)]) - 
linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=750), outputs_0:QuantSpec(Raw(type: Float16), uuid=758), )] (%1207:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=750)]) -> (%1215:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=758)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=758), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=759), )] (%1215:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=758)]) -> (%1216:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=759)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=757), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23), )] (%360:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %1214:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=757)]) -> (%1217:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)]) - linalg.CPU.ConcatOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=759), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51), )] (%361:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)], %1216:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=759)]) -> (%1218:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23), )] (%1217:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)]) -> (%1219:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51), )] (%1218:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=51)]) -> (%1220:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=752), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=760), )] (%1210:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=752)], %1219:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)]) -> (%1221:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=760)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=760), inputs_1:QuantSpec(Raw(type: Float32), uuid=761), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=760), )] (%1221:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=760)], %1222:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=761), constant:[0.088388346]]) -> (%1223:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=760)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=760), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=762), )] (%1223:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=760)]) -> (%1224:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=762)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=762), inputs_1:QuantSpec(Raw(type: Int16), uuid=763), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=762), )] (%1224:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=762)], %1225:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=763), constant:[-20]]) -> (%1226:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=762)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=764), outputs_0:QuantSpec(Raw(type: UInt8), uuid=765), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %1227:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=764), constant:[0.71875]]) -> (%1228:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=765)]) - 
linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=765), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=760), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=762), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=762), )] (%1228:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=765)], %1223:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=760)], %1226:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=762)]) -> (%1229:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=762)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=762), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=766), )] (%1229:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=762)]) -> (%1230:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=766)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=766), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767), )] (%1230:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=766)], %1220:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) -> (%1231:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767), )] (%1231:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767)]) -> (%1232:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767), )] (%1232:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767)]) -> (%1232:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=769), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=768))] (%1232:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767)]) -> (%1233:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=769)]) - cf.ReturnOp (%1233:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=769)], %1214:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=757)], %1216:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=759)]) -> () + (%8896:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8055:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %8056:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) -> (%8928:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770)], %8909:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758)], %8911:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=760)]) { + linalg.CPU.LinearOp (%8896:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746)]) -> (%8897:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=752)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=748))] (%8896:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746)]) 
-> (%8898:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=751), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=750))] (%8896:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746)]) -> (%8899:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=751)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=752), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=752), )] (%8897:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=752)]) -> (%8897:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=752)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=752), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=752), )] (%8897:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=752)]) -> (%8900:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=752)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749), )] (%8898:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749)]) -> (%8898:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749), )] (%8898:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749)]) -> (%8901:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=751), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=751), )] (%8899:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=751)]) -> (%8899:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=751)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=751), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=751), )] (%8899:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=751)]) -> (%8902:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=751)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=752), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=753), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=754))] (%8900:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=752)]) -> (%8903:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=753)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=755), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=756))] (%8901:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749)]) -> (%8904:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=755)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=753), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=753), )] (%8903:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=753)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8905:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=753)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=755), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=755), )] (%8904:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=755)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8906:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=755)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=755), outputs_0:QuantSpec(Raw(type: Float16), uuid=757), )] (%8906:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=755)]) -> (%8907:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=757)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=757), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758), )] (%8907:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=757)]) -> (%8908:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758), )] (%8908:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758)]) -> (%8909:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=751), outputs_0:QuantSpec(Raw(type: Float16), uuid=759), )] (%8902:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=751)]) -> (%8910:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=759)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=759), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=760), )] (%8910:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=759)]) -> (%8911:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=760)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23), )] (%8055:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %8909:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758)]) -> (%8912:tensor<[1, 8, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=760), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51), )] (%8056:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)], %8911:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=760)]) -> (%8913:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23), )] (%8912:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)]) -> (%8914:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=51), )] (%8913:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) -> (%8915:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=753), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761), )] (%8905:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=753)], %8914:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)]) -> (%8916:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761), inputs_1:QuantSpec(Raw(type: Float32), uuid=762), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761), )] (%8916:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761)], %8917:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=762), constant:[0.088388346]]) -> 
(%8918:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=763), )] (%8918:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761)]) -> (%8919:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=763)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=763), inputs_1:QuantSpec(Raw(type: Int16), uuid=764), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=763), )] (%8919:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=763)], %8920:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=764), constant:[-20]]) -> (%8921:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=763)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=765), outputs_0:QuantSpec(Raw(type: UInt8), uuid=766), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8922:tensor<[1], UInt16, 
CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=765), constant:[0.71875]]) -> (%8923:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=766)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=766), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=763), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=763), )] (%8923:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=766)], %8918:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761)], %8921:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=763)]) -> (%8924:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=763)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=763), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767), )] (%8924:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=763)]) -> (%8925:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767)]) + linalg.CPU.MatMulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=768), )] (%8925:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767)], %8915:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) -> (%8926:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=768)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=768), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=768), )] (%8926:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=768)]) -> (%8927:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=768)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=768), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=768), )] (%8927:tensor<[1, 32, 16, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=768)]) -> (%8927:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=768)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=768), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=769))] (%8927:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=768)]) -> (%8928:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770)]) + cf.ReturnOp (%8928:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770)], %8909:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758)], %8911:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=760)]) -> () } } graph.SubGraphOp @model.layers.20.mlp [using_qnn:true, symbol:model.layers.20.mlp] { - (%1235:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770)]) -> (%1240:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=778)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=773), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=772))] (%1235:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770)]) -> (%1236:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=773)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=773), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774), )] (%1236:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=773)]) -> (%1237:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=776), weight_weight:QuantSpec(LPBQ(quant_min: -8, 
quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=775))] (%1235:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770)]) -> (%1238:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=776)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=776), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774), )] (%1237:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774)], %1238:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=776)]) -> (%1239:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=778), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=777))] (%1239:tensor<[1, 32, 6144], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774)]) -> (%1240:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=778)]) - cf.ReturnOp (%1240:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=778)]) -> () + (%8930:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=771)]) -> (%8935:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=771), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=773))] (%8930:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=771)]) -> (%8931:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774)]) + linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=775), )] (%8931:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774)]) -> (%8932:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=775)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=771), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=777), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=776))] (%8930:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=771)]) -> (%8933:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=777)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=775), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=777), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=775), )] (%8932:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=775)], %8933:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=777)]) -> (%8934:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=775)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=775), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=778))] (%8934:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=775)]) -> (%8935:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)]) + cf.ReturnOp (%8935:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)]) -> () } } graph.SubGraphOp @model.layers.21 [using_qnn:true, symbol:model.layers.21] { - (%1241:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=778)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %362:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %363:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) -> (%1282:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=812)], %1255:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=791)], %1257:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=793)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=778), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779), )] (%1241:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=778)]) -> (%1242:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)]) - graph.CallGraphOp @model.layers.21.self_attn (%1242:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %362:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %363:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) -> (%1274:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=803)], %1255:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=791)], %1257:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=793)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=803), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=778), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=803), )] (%1274:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=803)], %1241:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=778)]) -> (%1275:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=803)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=803), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804), )] (%1275:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=803)]) -> (%1276:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804)]) - graph.CallGraphOp @model.layers.21.mlp (%1276:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804)]) -> (%1281:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=812)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=812), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=803), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=812), )] (%1281:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=812)], %1275:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=803)]) -> (%1282:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=812)]) - cf.ReturnOp (%1282:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=812)], %1255:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=791)], %1257:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=793)]) -> () + (%8936:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8057:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %8058:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) -> (%8977:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)], %8950:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792)], %8952:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=781))] (%8936:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)]) -> (%8937:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780)]) + graph.CallGraphOp @model.layers.21.self_attn (%8937:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8057:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %8058:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) -> (%8969:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804)], %8950:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792)], %8952:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804), )] (%8969:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804)], %8936:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)]) -> (%8970:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=805), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=806))] (%8970:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=804)]) -> (%8971:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=805)]) + graph.CallGraphOp @model.layers.21.mlp (%8971:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=805)]) -> (%8976:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813), )] (%8976:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)], %8970:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804)]) -> (%8977:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)]) + cf.ReturnOp (%8977:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)], %8950:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792)], %8952:tensor<[1, 8, 
32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)]) -> () } } graph.SubGraphOp @model.layers.21.self_attn [using_qnn:true, symbol:model.layers.21.self_attn] { - (%1242:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %362:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %363:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) -> (%1274:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=803)], %1255:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=791)], %1257:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=793)]) { - linalg.CPU.LinearOp (%1242:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)]) -> (%1243:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=785)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=782), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=781))] (%1242:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)]) -> (%1244:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=782)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=783))] (%1242:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)]) -> (%1245:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=785), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=785), )] (%1243:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=785)]) -> (%1243:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=785)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=785), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=785), )] (%1243:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=785)]) -> (%1246:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=785)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=782), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=782), )] (%1244:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=782)]) -> (%1244:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=782)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=782), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=782), )] (%1244:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=782)]) -> (%1247:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=782)]) - linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784), )] (%1245:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784)]) -> (%1245:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784), )] (%1245:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784)]) -> (%1248:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=785), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=786), )] (%1246:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=785)]) -> (%1249:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=786)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=782), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=788), )] (%1247:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=782)]) -> (%1250:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=788)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=786), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=786), )] (%1249:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=786)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1251:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=786)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=788), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=788), )] (%1250:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=788)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1252:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=788)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=788), outputs_0:QuantSpec(Raw(type: Float16), uuid=790), )] (%1252:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=788)]) -> (%1253:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=790)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=790), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=791), )] (%1253:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=790)]) -> (%1254:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=791)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=791), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=791), )] (%1254:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=791)]) -> (%1255:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=791)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784), outputs_0:QuantSpec(Raw(type: Float16), uuid=792), )] (%1248:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784)]) -> (%1256:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=792)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=792), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=793), )] (%1256:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=792)]) -> (%1257:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=793)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=791), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24), )] (%362:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %1255:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=791)]) -> (%1258:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=793), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52), )] (%363:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)], %1257:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=793)]) -> (%1259:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24), )] 
(%1258:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)]) -> (%1260:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52), )] (%1259:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) -> (%1261:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=786), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=794), )] (%1251:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=786)], %1260:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)]) -> (%1262:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=794)]) - linalg.CPU.MulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=794), inputs_1:QuantSpec(Raw(type: Float32), uuid=795), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=794), )] (%1262:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=794)], %1263:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=795), constant:[0.088388346]]) -> (%1264:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=794)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=794), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=796), )] (%1264:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=794)]) -> (%1265:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=796)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=796), inputs_1:QuantSpec(Raw(type: Int16), uuid=797), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=796), )] (%1265:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=796)], %1266:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=797), constant:[-20]]) -> (%1267:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=796)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=798), outputs_0:QuantSpec(Raw(type: UInt8), uuid=799), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %1268:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=798), constant:[-0.80859375]]) -> (%1269:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=799)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=799), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=794), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=796), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=796), )] (%1269:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=799)], %1264:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=794)], %1267:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=796)]) -> (%1270:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=796)]) - linalg.CPU.SoftmaxOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=796), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=800), )] (%1270:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=796)]) -> (%1271:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=800)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=800), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=801), )] (%1271:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=800)], %1261:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) -> (%1272:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=801)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=801), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=801), )] (%1272:tensor<[1, 16, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=801)]) -> (%1273:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=801)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=801), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=801), )] (%1273:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=801)]) -> (%1273:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=801)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=801), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=803), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=802))] (%1273:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=801)]) -> (%1274:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=803)]) - cf.ReturnOp (%1274:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=803)], %1255:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=791)], %1257:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=793)]) -> () + (%8937:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8057:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %8058:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) -> (%8969:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804)], %8950:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792)], %8952:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)]) { + linalg.CPU.LinearOp (%8937:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780)]) -> (%8938:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=786)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=783), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=782))] (%8937:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780)]) -> (%8939:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=783)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=784))] (%8937:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780)]) -> (%8940:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=786), 
outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=786), )] (%8938:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=786)]) -> (%8938:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=786)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=786), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=786), )] (%8938:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=786)]) -> (%8941:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=786)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=783), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=783), )] (%8939:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=783)]) -> (%8939:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=783)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=783), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=783), )] (%8939:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=783)]) -> (%8942:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=783)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785), )] (%8940:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785)]) -> (%8940:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785), )] (%8940:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785)]) -> (%8943:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=786), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=787), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=788))] (%8941:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=786)]) -> (%8944:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=787)]) + linalg.CPU.RMSNormOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=783), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=790))] (%8942:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=783)]) -> (%8945:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=787), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=787), )] (%8944:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=787)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8946:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=787)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789), )] (%8945:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8947:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789), outputs_0:QuantSpec(Raw(type: Float16), uuid=791), )] (%8947:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789)]) -> (%8948:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=791)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=791), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=792), )] (%8948:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=791)]) -> (%8949:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792), )] (%8949:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792)]) -> (%8950:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785), outputs_0:QuantSpec(Raw(type: Float16), uuid=793), )] (%8943:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785)]) -> (%8951:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=793)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=793), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794), )] (%8951:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=793)]) -> (%8952:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=794)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24), )] (%8057:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %8950:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792)]) -> (%8953:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52), )] (%8058:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)], %8952:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)]) -> (%8954:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24), )] (%8953:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)]) -> (%8955:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52), )] (%8954:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) -> (%8956:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=787), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=795), )] (%8946:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=787)], %8955:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)]) -> (%8957:tensor<[1, 16, 
32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=795)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=795), inputs_1:QuantSpec(Raw(type: Float32), uuid=796), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=795), )] (%8957:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=795)], %8958:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=796), constant:[0.088388346]]) -> (%8959:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=795)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=795), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797), )] (%8959:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=795)]) -> (%8960:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797), inputs_1:QuantSpec(Raw(type: Int16), uuid=798), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=797), )] (%8960:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797)], %8961:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=798), constant:[-20]]) -> (%8962:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=799), outputs_0:QuantSpec(Raw(type: UInt8), uuid=800), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8963:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=799), constant:[-0.80859375]]) -> (%8964:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=800)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=800), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=795), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797), )] (%8964:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=800)], %8959:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=795)], %8962:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797)]) -> (%8965:tensor<[1, 16, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=801), )] (%8965:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797)]) -> (%8966:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=801)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=801), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=802), )] (%8966:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=801)], %8956:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) -> (%8967:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=802)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=802), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=802), )] (%8967:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=802)]) -> (%8968:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=802)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=802), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=802), )] (%8968:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=802)]) -> (%8968:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=802)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=802), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=803))] (%8968:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=802)]) -> (%8969:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804)]) + cf.ReturnOp (%8969:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804)], %8950:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792)], %8952:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)]) -> () } } graph.SubGraphOp @model.layers.21.mlp [using_qnn:true, symbol:model.layers.21.mlp] { - (%1276:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804)]) -> (%1281:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=812)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=807), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=806))] (%1276:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804)]) -> (%1277:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=807)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=807), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=808), )] (%1277:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=807)]) -> (%1278:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=808)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=810), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=809))] (%1276:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804)]) -> (%1279:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=810)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=808), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=810), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=808), )] (%1278:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=808)], %1279:tensor<[1, 32, 6144], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=810)]) -> (%1280:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=808)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=808), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=812), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=811))] (%1280:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=808)]) -> (%1281:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=812)]) - cf.ReturnOp (%1281:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=812)]) -> () + (%8971:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=805)]) -> (%8976:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=805), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=808), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=807))] (%8971:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=805)]) -> (%8972:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=808)]) + linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=808), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=809), )] (%8972:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=808)]) -> (%8973:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=809)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=805), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=811), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=810))] (%8971:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=805)]) -> (%8974:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=811)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=809), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=811), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=809), )] (%8973:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=809)], %8974:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=811)]) -> (%8975:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=809)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=809), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=812))] (%8975:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=809)]) -> (%8976:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)]) + cf.ReturnOp (%8976:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=813)]) -> () } } graph.SubGraphOp @model.layers.22 [using_qnn:true, symbol:model.layers.22] { - (%1282:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=812)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %364:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %365:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) -> (%1323:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=846)], %1296:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=825)], %1298:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=827)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=812), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813), )] (%1282:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=812)]) -> (%1283:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)]) - graph.CallGraphOp @model.layers.22.self_attn (%1283:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %364:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %365:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) -> (%1315:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=837)], %1296:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=825)], %1298:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=827)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=837), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=812), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=837), )] (%1315:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=837)], %1282:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=812)]) -> (%1316:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=837)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=837), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838), )] (%1316:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=837)]) -> (%1317:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838)]) - graph.CallGraphOp @model.layers.22.mlp (%1317:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838)]) -> (%1322:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=846)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=846), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=837), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=846), )] (%1322:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=846)], %1316:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=837)]) -> (%1323:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=846)]) - cf.ReturnOp (%1323:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=846)], %1296:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=825)], %1298:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=827)]) -> () + (%8977:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, 
CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8059:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %8060:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) -> (%9018:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)], %8991:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)], %8993:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=828)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=815))] (%8977:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)]) -> (%8978:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814)]) + graph.CallGraphOp @model.layers.22.self_attn (%8978:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814)], %8074:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8059:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %8060:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) -> (%9010:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838)], %8991:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)], %8993:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=828)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838), )] (%9010:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838)], %8977:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)]) -> (%9011:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=839), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=840))] (%9011:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838)]) -> (%9012:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=839)]) + graph.CallGraphOp @model.layers.22.mlp (%9012:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=839)]) -> (%9017:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847), )] (%9017:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=847)], %9011:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838)]) -> (%9018:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)]) + cf.ReturnOp (%9018:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)], %8991:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)], %8993:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=828)]) -> () } } graph.SubGraphOp @model.layers.22.self_attn [using_qnn:true, symbol:model.layers.22.self_attn] { - (%1283:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %364:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %365:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: 
Int8, scale_type: Float32), uuid=53)]) -> (%1315:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=837)], %1296:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=825)], %1298:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=827)]) { - linalg.CPU.LinearOp (%1283:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)]) -> (%1284:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=819)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=816), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=815))] (%1283:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)]) -> (%1285:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=816)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=818), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=817))] (%1283:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)]) -> (%1286:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=818)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=819), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=819), )] (%1284:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=819)]) -> (%1284:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=819)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=819), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=819), )] (%1284:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=819)]) -> (%1287:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=819)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=816), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=816), )] (%1285:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=816)]) -> (%1285:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=816)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=816), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=816), )] (%1285:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=816)]) -> (%1288:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=816)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=818), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=818), )] (%1286:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=818)]) -> (%1286:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=818)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=818), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=818), )] (%1286:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=818)]) -> (%1289:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=818)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=819), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=820), )] (%1287:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=819)]) -> (%1290:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=820)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=816), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=822), )] (%1288:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=816)]) -> (%1291:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=822)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=820), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=820), )] (%1290:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=820)], 
%379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1292:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=820)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=822), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=822), )] (%1291:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=822)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1293:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=822)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=822), 
outputs_0:QuantSpec(Raw(type: Float16), uuid=824), )] (%1293:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=822)]) -> (%1294:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=824)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=824), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=825), )] (%1294:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=824)]) -> (%1295:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=825)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=825), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=825), )] (%1295:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=825)]) -> (%1296:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=825)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=818), outputs_0:QuantSpec(Raw(type: Float16), uuid=826), )] (%1289:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=818)]) -> (%1297:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), 
uuid=826)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=826), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=827), )] (%1297:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=826)]) -> (%1298:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=827)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=825), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25), )] (%364:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %1296:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=825)]) -> (%1299:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=827), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53), )] (%365:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=53)], %1298:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=827)]) -> (%1300:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25), )] (%1299:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)]) -> (%1301:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53), )] (%1300:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) -> (%1302:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=820), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=828), )] (%1292:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=820)], %1301:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)]) -> (%1303:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=828)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=828), inputs_1:QuantSpec(Raw(type: Float32), uuid=829), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=828), )] (%1303:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=828)], %1304:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=829), constant:[0.088388346]]) -> (%1305:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=828)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=828), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=830), )] (%1305:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=828)]) -> 
(%1306:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=830)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=830), inputs_1:QuantSpec(Raw(type: Int16), uuid=831), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=830), )] (%1306:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=830)], %1307:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=831), constant:[-20]]) -> (%1308:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=830)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=832), outputs_0:QuantSpec(Raw(type: UInt8), uuid=833), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %1309:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=832), constant:[-0.42773438]]) -> (%1310:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=833)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=833), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=828), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=830), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=830), )] (%1310:tensor<[1, 1, 32, 
1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=833)], %1305:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=828)], %1308:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=830)]) -> (%1311:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=830)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=830), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=834), )] (%1311:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=830)]) -> (%1312:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=834)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=834), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=835), )] (%1312:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=834)], %1302:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) -> (%1313:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=835)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=835), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=835), )] (%1313:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=835)]) -> (%1314:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=835)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=835), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=835), )] (%1314:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=835)]) -> (%1314:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=835)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=835), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=837), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=836))] (%1314:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=835)]) -> (%1315:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=837)]) - cf.ReturnOp (%1315:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=837)], %1296:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=825)], %1298:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=827)]) -> () + (%8978:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8059:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %8060:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) -> (%9010:tensor<[1, 32, 
2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838)], %8991:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)], %8993:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=828)]) { + linalg.CPU.LinearOp (%8978:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814)]) -> (%8979:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=820)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=816))] (%8978:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814)]) -> (%8980:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819), weight_weight:QuantSpec(LPBQ(quant_min: 
-8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=818))] (%8978:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814)]) -> (%8981:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=820), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=820), )] (%8979:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=820)]) -> (%8979:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=820)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=820), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=820), )] (%8979:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=820)]) -> (%8982:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=820)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817), )] (%8980:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817)]) -> (%8980:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817)]) + linalg.CPU.TransposeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817), )] (%8980:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817)]) -> (%8983:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819), )] (%8981:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819)]) -> (%8981:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819), )] (%8981:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819)]) -> (%8984:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819)]) + 
linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=820), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=821), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=822))] (%8982:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=820)]) -> (%8985:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=821)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=823), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=824))] (%8983:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817)]) -> (%8986:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=823)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=821), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=821), )] (%8985:tensor<[1, 16, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=821)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8987:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=821)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=823), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=823), )] (%8986:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=823)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8988:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=823)]) + linalg.CPU.CastTypeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=823), outputs_0:QuantSpec(Raw(type: Float16), uuid=825), )] (%8988:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=823)]) -> (%8989:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=825)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=825), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826), )] (%8989:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=825)]) -> (%8990:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826), )] (%8990:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)]) -> (%8991:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819), outputs_0:QuantSpec(Raw(type: Float16), uuid=827), )] (%8984:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=819)]) -> (%8992:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=827)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=827), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=828), )] (%8992:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=827)]) -> (%8993:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=828)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25), )] (%8059:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %8991:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)]) -> (%8994:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=828), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=53), )] (%8060:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)], %8993:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=828)]) -> (%8995:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25), )] (%8994:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)]) -> (%8996:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53), )] (%8995:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) -> (%8997:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=821), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829), )] (%8987:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=821)], %8996:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)]) -> (%8998:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829), inputs_1:QuantSpec(Raw(type: Float32), uuid=830), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829), )] (%8998:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829)], %8999:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=830), constant:[0.088388346]]) -> (%9000:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=831), )] (%9000:tensor<[1, 16, 32, 1024], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829)]) -> (%9001:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=831)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=831), inputs_1:QuantSpec(Raw(type: Int16), uuid=832), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=831), )] (%9001:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=831)], %9002:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=832), constant:[-20]]) -> (%9003:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=831)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=833), outputs_0:QuantSpec(Raw(type: UInt8), uuid=834), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9004:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=833), constant:[-0.42773438]]) -> (%9005:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=834)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=834), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=831), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=831), )] (%9005:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=834)], %9000:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829)], %9003:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=831)]) -> (%9006:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=831)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=831), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=835), )] (%9006:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=831)]) -> (%9007:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=835)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=835), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836), )] (%9007:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=835)], %8997:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) -> (%9008:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836), )] (%9008:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836)]) -> (%9009:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836), )] (%9009:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836)]) -> (%9009:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=838), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=837))] (%9009:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836)]) -> (%9010:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838)]) + cf.ReturnOp (%9010:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838)], %8991:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)], %8993:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=828)]) -> () } } graph.SubGraphOp @model.layers.22.mlp [using_qnn:true, symbol:model.layers.22.mlp] { - (%1317:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838)]) -> (%1322:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=846)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=841), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=840))] (%1317:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838)]) -> (%1318:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=841)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=841), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=842), )] (%1318:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=841)]) -> (%1319:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=842)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=844), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=843))] (%1317:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838)]) -> (%1320:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=844)]) - linalg.CPU.MulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=842), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=844), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=842), )] (%1319:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=842)], %1320:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=844)]) -> (%1321:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=842)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=842), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=846), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=845))] (%1321:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=842)]) -> (%1322:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=846)]) - cf.ReturnOp (%1322:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=846)]) -> () + (%9012:tensor<[1, 
32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=839)]) -> (%9017:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=839), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=842), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=841))] (%9012:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=839)]) -> (%9013:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=842)]) + linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=842), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=843), )] (%9013:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=842)]) -> (%9014:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=843)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=839), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=845), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=844))] (%9012:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=839)]) -> (%9015:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=845)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=843), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=845), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=843), )] (%9014:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=843)], %9015:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=845)]) -> (%9016:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=843)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=843), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 
7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=846))] (%9016:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=843)]) -> (%9017:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)]) + cf.ReturnOp (%9017:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)]) -> () } } graph.SubGraphOp @model.layers.23 [using_qnn:true, symbol:model.layers.23] { - (%1323:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=846)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %366:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %367:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) -> (%1364:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=880)], %1337:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=859)], %1339:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=861)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=846), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847), )] (%1323:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=846)]) -> (%1324:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)]) - graph.CallGraphOp @model.layers.23.self_attn (%1324:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %366:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %367:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) -> 
(%1356:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=871)], %1337:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=859)], %1339:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=861)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=871), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=846), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=871), )] (%1356:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=871)], %1323:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=846)]) -> (%1357:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=871)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=871), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872), )] (%1357:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=871)]) -> (%1358:tensor<[1, 32, 
2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872)]) - graph.CallGraphOp @model.layers.23.mlp (%1358:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872)]) -> (%1363:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=880)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=880), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=871), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=880), )] (%1363:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=880)], %1357:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=871)]) -> (%1364:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=880)]) - cf.ReturnOp (%1364:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=880)], %1337:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=859)], %1339:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=861)]) -> () + (%9018:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8061:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %8062:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) -> (%9059:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)], %9032:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860)], %9034:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=862)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=848), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=849))] (%9018:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)]) -> (%9019:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=848)]) + graph.CallGraphOp @model.layers.23.self_attn (%9019:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=848)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8061:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %8062:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) -> (%9051:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872)], %9032:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860)], %9034:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=862)]) + linalg.CPU.AddOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872), )] (%9051:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872)], %9018:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)]) -> (%9052:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=873), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=874))] (%9052:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872)]) -> (%9053:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=873)]) + graph.CallGraphOp @model.layers.23.mlp (%9053:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=873)]) -> (%9058:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881), )] (%9058:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)], %9052:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872)]) -> (%9059:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)]) + cf.ReturnOp (%9059:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)], %9032:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860)], %9034:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=862)]) -> () } } graph.SubGraphOp @model.layers.23.self_attn [using_qnn:true, symbol:model.layers.23.self_attn] { - (%1324:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)], %379:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %366:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %367:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) -> (%1356:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=871)], %1337:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=859)], %1339:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=861)]) { - linalg.CPU.LinearOp (%1324:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)]) -> (%1325:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=853)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=850), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=849))] (%1324:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)]) -> (%1326:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=850)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=852), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=851))] (%1324:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)]) -> (%1327:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=852)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=853), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=853), )] (%1325:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=853)]) -> (%1325:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=853)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=853), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=853), )] (%1325:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=853)]) -> (%1328:tensor<[1, 16, 32, 
128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=853)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=850), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=850), )] (%1326:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=850)]) -> (%1326:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=850)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=850), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=850), )] (%1326:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=850)]) -> (%1329:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=850)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=852), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=852), )] (%1327:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=852)]) -> (%1327:tensor<[1, 32, 8, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=852)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=852), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=852), )] (%1327:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=852)]) -> (%1330:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=852)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=853), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=854), )] (%1328:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=853)]) -> (%1331:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=854)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=850), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=856), )] (%1329:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=850)]) -> (%1332:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=856)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=854), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=854), )] (%1331:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=854)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1333:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=854)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=856), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=856), )] (%1332:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=856)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1334:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=856)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=856), outputs_0:QuantSpec(Raw(type: Float16), uuid=858), )] (%1334:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=856)]) -> (%1335:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=858)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=858), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=859), )] (%1335:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=858)]) -> (%1336:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=859)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=859), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=859), )] (%1336:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=859)]) -> (%1337:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=859)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=852), outputs_0:QuantSpec(Raw(type: Float16), uuid=860), )] (%1330:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=852)]) -> (%1338:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=860)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=860), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=861), )] (%1338:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=860)]) -> (%1339:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=861)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=859), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26), )] (%366:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %1337:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=859)]) -> (%1340:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=861), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54), )] (%367:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)], %1339:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=861)]) -> (%1341:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26), )] (%1340:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)]) -> (%1342:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54), )] (%1341:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) -> (%1343:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=854), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=862), )] (%1333:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=854)], %1342:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)]) -> (%1344:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=862)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=862), inputs_1:QuantSpec(Raw(type: Float32), uuid=863), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=862), )] (%1344:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=862)], %1345:tensor<[1], 
Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=863), constant:[0.088388346]]) -> (%1346:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=862)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=862), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=864), )] (%1346:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=862)]) -> (%1347:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=864)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=864), inputs_1:QuantSpec(Raw(type: Int16), uuid=865), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=864), )] (%1347:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=864)], %1348:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=865), constant:[-20]]) -> (%1349:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=864)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=866), outputs_0:QuantSpec(Raw(type: UInt8), uuid=867), )] (%319:tensor<[1, 1, 32, 1024], 
UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %1350:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=866), constant:[0.96484375]]) -> (%1351:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=867)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=867), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=862), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=864), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=864), )] (%1351:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=867)], %1346:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=862)], %1349:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=864)]) -> (%1352:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=864)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=864), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=868), )] (%1352:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=864)]) -> (%1353:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=868)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=868), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=869), )] (%1353:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=868)], %1343:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) -> (%1354:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=869)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=869), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=869), )] (%1354:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=869)]) -> (%1355:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=869)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=869), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=869), )] (%1355:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=869)]) -> (%1355:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=869)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=869), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=871), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=870))] (%1355:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=869)]) -> (%1356:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=871)]) - cf.ReturnOp (%1356:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=871)], %1337:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=859)], %1339:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=861)]) -> () + (%9019:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=848)], %8074:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8061:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %8062:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) -> (%9051:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872)], %9032:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860)], %9034:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=862)]) { + linalg.CPU.LinearOp (%9019:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=848)]) -> (%9020:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=854)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=848), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=850))] (%9019:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=848)]) -> (%9021:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=848), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=853), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=852))] (%9019:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=848)]) -> (%9022:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=853)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=854), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=854), )] (%9020:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=854)]) -> (%9020:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=854)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=854), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=854), )] (%9020:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=854)]) -> (%9023:tensor<[1, 16, 32, 
128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=854)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851), )] (%9021:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851)]) -> (%9021:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851), )] (%9021:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851)]) -> (%9024:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=853), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=853), )] (%9022:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=853)]) -> (%9022:tensor<[1, 32, 8, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=853)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=853), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=853), )] (%9022:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=853)]) -> (%9025:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=853)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=854), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=855), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=856))] (%9023:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=854)]) -> (%9026:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=855)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=857), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=858))] (%9024:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=851)]) -> (%9027:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=857)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=855), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=855), )] (%9026:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=855)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%9028:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=855)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=857), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=857), )] (%9027:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=857)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%9029:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=857)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=857), outputs_0:QuantSpec(Raw(type: Float16), uuid=859), )] (%9029:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=857)]) -> (%9030:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=859)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=859), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860), )] (%9030:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=859)]) -> (%9031:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 
127, quant_to_type: Int8, scale_type: Float32), uuid=860), )] (%9031:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860)]) -> (%9032:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=853), outputs_0:QuantSpec(Raw(type: Float16), uuid=861), )] (%9025:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=853)]) -> (%9033:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=861)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=861), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=862), )] (%9033:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=861)]) -> (%9034:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=862)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26), )] (%8061:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=26)], %9032:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860)]) -> (%9035:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=862), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54), )] (%8062:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)], %9034:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=862)]) -> (%9036:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26), )] (%9035:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)]) -> (%9037:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)]) + linalg.CPU.RepeatOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54), )] (%9036:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) -> (%9038:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=855), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=863), )] (%9028:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=855)], %9037:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)]) -> (%9039:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=863)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=863), inputs_1:QuantSpec(Raw(type: Float32), uuid=864), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=863), )] (%9039:tensor<[1, 16, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=863)], %9040:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=864), constant:[0.088388346]]) -> (%9041:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=863)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=863), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=865), )] (%9041:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=863)]) -> (%9042:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=865)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=865), inputs_1:QuantSpec(Raw(type: Int16), uuid=866), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=865), )] (%9042:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=865)], %9043:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=866), constant:[-20]]) -> (%9044:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=865)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: 
UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=867), outputs_0:QuantSpec(Raw(type: UInt8), uuid=868), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9045:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=867), constant:[0.96484375]]) -> (%9046:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=868)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=868), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=863), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=865), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=865), )] (%9046:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=868)], %9041:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=863)], %9044:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=865)]) -> (%9047:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=865)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=865), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=869), )] (%9047:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=865)]) -> (%9048:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=869)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=869), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870), )] (%9048:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=869)], %9038:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) -> (%9049:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870), )] (%9049:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870)]) -> (%9050:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=870), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870), )] (%9050:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870)]) -> (%9050:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=871))] (%9050:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870)]) -> (%9051:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872)]) + cf.ReturnOp (%9051:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872)], %9032:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860)], %9034:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=862)]) -> () } } graph.SubGraphOp @model.layers.23.mlp [using_qnn:true, symbol:model.layers.23.mlp] 
{ - (%1358:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872)]) -> (%1363:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=880)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=875), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=874))] (%1358:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872)]) -> (%1359:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=875)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=875), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=876), )] (%1359:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=875)]) -> (%1360:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=876)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=872), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=878), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=877))] (%1358:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872)]) -> (%1361:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=878)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=876), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=878), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=876), )] (%1360:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=876)], %1361:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=878)]) -> (%1362:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=876)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=876), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=880), 
weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=879))] (%1362:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=876)]) -> (%1363:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=880)]) - cf.ReturnOp (%1363:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=880)]) -> () + (%9053:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=873)]) -> (%9058:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=873), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=876), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=875))] (%9053:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=873)]) -> (%9054:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=876)]) + linalg.CPU.SiLUOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=876), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877), )] (%9054:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=876)]) -> (%9055:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=873), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=879), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=878))] (%9053:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=873)]) -> (%9056:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=879)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=879), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877), )] (%9055:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877)], %9056:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=879)]) -> (%9057:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=880))] (%9057:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877)]) -> (%9058:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)]) + cf.ReturnOp (%9058:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)]) -> () } } graph.SubGraphOp @model.layers.24 [using_qnn:true, symbol:model.layers.24] { - (%1364:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=880)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %368:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %369:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) -> (%1405:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=914)], %1378:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=893)], %1380:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=895)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=880), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881), )] (%1364:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=880)]) -> (%1365:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)]) - graph.CallGraphOp @model.layers.24.self_attn (%1365:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)], 
%379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %368:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %369:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) -> (%1397:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=905)], %1378:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=893)], %1380:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=895)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=905), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=880), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=905), )] (%1397:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=905)], %1364:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=880)]) -> (%1398:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=905)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=905), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906), )] (%1398:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=905)]) -> (%1399:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906)]) - graph.CallGraphOp @model.layers.24.mlp (%1399:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906)]) -> (%1404:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=914)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=914), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=905), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=914), )] (%1404:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=914)], 
%1398:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=905)]) -> (%1405:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=914)]) - cf.ReturnOp (%1405:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=914)], %1378:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=893)], %1380:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=895)]) -> () + (%9059:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8063:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %8064:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) -> (%9100:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)], %9073:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894)], %9075:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=896)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=883))] (%9059:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)]) -> (%9060:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882)]) + graph.CallGraphOp @model.layers.24.self_attn (%9060:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8063:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %8064:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) -> (%9092:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906)], %9073:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894)], %9075:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=896)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906), )] (%9092:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906)], %9059:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)]) -> (%9093:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=907), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=908))] (%9093:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906)]) -> (%9094:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=907)]) + graph.CallGraphOp @model.layers.24.mlp (%9094:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=907)]) -> (%9099:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915), )] (%9099:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)], %9093:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906)]) -> (%9100:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)]) + cf.ReturnOp (%9100:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)], %9073:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894)], %9075:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=896)]) -> () } } graph.SubGraphOp @model.layers.24.self_attn [using_qnn:true, symbol:model.layers.24.self_attn] { - (%1365:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %368:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %369:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) -> (%1397:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=905)], %1378:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=893)], %1380:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=895)]) { - linalg.CPU.LinearOp (%1365:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)]) -> (%1366:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=887)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=884), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=883))] (%1365:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)]) -> (%1367:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=884)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=886), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=885))] (%1365:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)]) -> (%1368:tensor<[1, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=886)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=887), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=887), )] (%1366:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=887)]) -> (%1366:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=887)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=887), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=887), )] (%1366:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=887)]) -> (%1369:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=887)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=884), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=884), )] (%1367:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=884)]) -> (%1367:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=884)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=884), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=884), )] (%1367:tensor<[1, 32, 8, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=884)]) -> (%1370:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=884)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=886), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=886), )] (%1368:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=886)]) -> (%1368:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=886)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=886), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=886), )] (%1368:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=886)]) -> (%1371:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=886)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=887), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=888), )] (%1369:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), 
uuid=887)]) -> (%1372:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=888)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=884), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=890), )] (%1370:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=884)]) -> (%1373:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=890)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=888), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=888), )] (%1372:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=888)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1374:tensor<[1, 16, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=888)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=890), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=890), )] (%1373:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=890)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1375:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=890)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=890), outputs_0:QuantSpec(Raw(type: Float16), uuid=892), )] (%1375:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=890)]) -> (%1376:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=892)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: 
Float16), uuid=892), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=893), )] (%1376:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=892)]) -> (%1377:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=893)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=893), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=893), )] (%1377:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=893)]) -> (%1378:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=893)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=886), outputs_0:QuantSpec(Raw(type: Float16), uuid=894), )] (%1371:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=886)]) -> (%1379:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=894)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=894), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=895), )] (%1379:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=894)]) -> (%1380:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=895)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=893), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27), )] (%368:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %1378:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=893)]) -> (%1381:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=895), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55), )] (%369:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)], %1380:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=895)]) -> (%1382:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=55)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27), )] (%1381:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)]) -> (%1383:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55), )] (%1382:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) -> (%1384:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=888), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=896), )] (%1374:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=888)], %1383:tensor<[1, 16, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)]) -> (%1385:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=896)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=896), inputs_1:QuantSpec(Raw(type: Float32), uuid=897), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=896), )] (%1385:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=896)], %1386:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=897), constant:[0.088388346]]) -> (%1387:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=896)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=896), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=898), )] (%1387:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=896)]) -> (%1388:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=898)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=898), 
inputs_1:QuantSpec(Raw(type: Int16), uuid=899), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=898), )] (%1388:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=898)], %1389:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=899), constant:[-20]]) -> (%1390:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=898)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=900), outputs_0:QuantSpec(Raw(type: UInt8), uuid=901), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %1391:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=900), constant:[0.07910156]]) -> (%1392:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=901)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=901), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=896), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=898), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=898), )] (%1392:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=901)], %1387:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=896)], %1390:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=898)]) -> (%1393:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=898)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=898), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=902), )] (%1393:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=898)]) -> (%1394:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=902)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=902), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=903), )] (%1394:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=902)], %1384:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) -> (%1395:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=903)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=903), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=903), )] (%1395:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=903)]) -> (%1396:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=903)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=903), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=903), )] (%1396:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=903)]) -> (%1396:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=903)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=903), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=905), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=904))] (%1396:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=903)]) -> (%1397:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=905)]) - cf.ReturnOp (%1397:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=905)], %1378:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=893)], %1380:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=895)]) -> () + (%9060:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8063:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %8064:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) -> (%9092:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906)], %9073:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894)], %9075:tensor<[1, 8, 32, 128], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=896)]) { + linalg.CPU.LinearOp (%9060:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882)]) -> (%9061:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=888)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=885), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=884))] (%9060:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882)]) -> (%9062:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=885)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=886))] (%9060:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882)]) -> (%9063:tensor<[1, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=888), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=888), )] (%9061:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=888)]) -> (%9061:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=888)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=888), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=888), )] (%9061:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=888)]) -> (%9064:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=888)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=885), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=885), )] (%9062:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=885)]) -> (%9062:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=885)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=885), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=885), )] (%9062:tensor<[1, 32, 8, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=885)]) -> (%9065:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=885)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887), )] (%9063:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887)]) -> (%9063:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887), )] (%9063:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887)]) -> (%9066:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=888), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=890))] (%9064:tensor<[1, 16, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=888)]) -> (%9067:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=885), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=891), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=892))] (%9065:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=885)]) -> (%9068:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=891)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889), )] (%9067:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%9069:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=891), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=891), )] (%9068:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=891)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%9070:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=891)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=891), outputs_0:QuantSpec(Raw(type: Float16), uuid=893), )] (%9070:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=891)]) -> (%9071:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=893)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=893), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894), )] (%9071:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=893)]) -> (%9072:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894), )] (%9072:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894)]) -> (%9073:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887), outputs_0:QuantSpec(Raw(type: Float16), uuid=895), )] (%9066:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887)]) -> (%9074:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=895)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=895), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=896), )] (%9074:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=895)]) -> (%9075:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=896)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27), )] (%8063:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %9073:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894)]) -> (%9076:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=896), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55), )] (%8064:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)], %9075:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=896)]) -> 
(%9077:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27), )] (%9076:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)]) -> (%9078:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55), )] (%9077:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) -> (%9079:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=897), )] (%9069:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889)], %9078:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)]) -> (%9080:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=897)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=897), inputs_1:QuantSpec(Raw(type: Float32), uuid=898), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=897), )] (%9080:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=897)], %9081:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=898), constant:[0.088388346]]) -> (%9082:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=897)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=897), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899), )] (%9082:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=897)]) -> (%9083:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899)]) + linalg.CPU.AddOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899), inputs_1:QuantSpec(Raw(type: Int16), uuid=900), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899), )] (%9083:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899)], %9084:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=900), constant:[-20]]) -> (%9085:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=901), outputs_0:QuantSpec(Raw(type: UInt8), uuid=902), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9086:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=901), constant:[0.07910156]]) -> (%9087:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=902)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=902), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=897), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899), )] (%9087:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=902)], %9082:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=897)], %9085:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899)]) -> (%9088:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=903), )] (%9088:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899)]) -> (%9089:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=903)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=903), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904), )] (%9089:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=903)], %9079:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) -> (%9090:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=904)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904), )] (%9090:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904)]) -> (%9091:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904), )] (%9091:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904)]) -> (%9091:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=905))] (%9091:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=904)]) -> (%9092:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906)]) + cf.ReturnOp (%9092:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906)], %9073:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894)], %9075:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=896)]) -> () } } graph.SubGraphOp @model.layers.24.mlp [using_qnn:true, symbol:model.layers.24.mlp] { - (%1399:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906)]) -> (%1404:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=914)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=909), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=908))] (%1399:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906)]) -> (%1400:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=909)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=909), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=910), )] (%1400:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=909)]) -> (%1401:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=910)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=912), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=911))] (%1399:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906)]) -> (%1402:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=912)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=910), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=912), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=910), )] 
(%1401:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=910)], %1402:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=912)]) -> (%1403:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=910)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=910), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=914), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=913))] (%1403:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=910)]) -> (%1404:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=914)]) - cf.ReturnOp (%1404:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=914)]) -> () + (%9094:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=907)]) -> (%9099:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)]) { + linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=907), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=910), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=909))] (%9094:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=907)]) -> (%9095:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=910)]) + linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=910), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=911), )] (%9095:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=910)]) -> (%9096:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=911)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=907), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=913), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=912))] (%9094:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=907)]) -> (%9097:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=913)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=911), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=913), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=911), )] (%9096:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=911)], %9097:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=913)]) -> (%9098:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=911)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=911), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=914))] (%9098:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=911)]) -> (%9099:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)]) + cf.ReturnOp (%9099:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)]) -> () } } graph.SubGraphOp @model.layers.25 [using_qnn:true, symbol:model.layers.25] { - (%1405:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=914)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %370:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %371:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) -> (%1446:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=948)], %1419:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=927)], %1421:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=929)]) { - linalg.CPU.RMSNormOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=914), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915), )] (%1405:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=914)]) -> (%1406:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)]) - graph.CallGraphOp @model.layers.25.self_attn (%1406:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %370:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %371:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) -> (%1438:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=939)], %1419:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=927)], %1421:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=929)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=939), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=914), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=939), )] (%1438:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=939)], %1405:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=914)]) -> (%1439:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=939)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=939), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940), )] (%1439:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=939)]) -> (%1440:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940)]) - graph.CallGraphOp @model.layers.25.mlp (%1440:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=940)]) -> (%1445:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=948)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=948), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=939), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=948), )] (%1445:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=948)], %1439:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=939)]) -> (%1446:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=948)]) - cf.ReturnOp (%1446:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=948)], %1419:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=927)], %1421:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=929)]) -> () + (%9100:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)], %8074:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8065:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %8066:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) -> (%9141:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)], %9114:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928)], %9116:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=930)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=916), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=917))] (%9100:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)]) -> (%9101:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=916)]) + graph.CallGraphOp @model.layers.25.self_attn (%9101:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=916)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8065:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %8066:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) -> (%9133:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940)], %9114:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928)], %9116:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=930)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=940), )] (%9133:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940)], %9100:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)]) -> (%9134:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=941), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=942))] (%9134:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940)]) -> (%9135:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=941)]) + graph.CallGraphOp @model.layers.25.mlp (%9135:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=941)]) -> (%9140:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949), )] (%9140:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)], %9134:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940)]) -> (%9141:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)]) + cf.ReturnOp (%9141:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)], %9114:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928)], %9116:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=930)]) -> () } } graph.SubGraphOp @model.layers.25.self_attn [using_qnn:true, symbol:model.layers.25.self_attn] { - (%1406:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], 
UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %370:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %371:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) -> (%1438:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=939)], %1419:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=927)], %1421:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=929)]) { - linalg.CPU.LinearOp (%1406:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)]) -> (%1407:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=921)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=918), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=917))] (%1406:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)]) -> (%1408:tensor<[1, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=918)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=920), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=919))] (%1406:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)]) -> (%1409:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=920)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=921), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=921), )] (%1407:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=921)]) -> (%1407:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=921)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=921), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=921), )] (%1407:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=921)]) -> (%1410:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=921)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=918), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=918), )] (%1408:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=918)]) -> (%1408:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=918)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=918), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=918), )] (%1408:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=918)]) -> (%1411:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=918)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=920), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=920), )] (%1409:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=920)]) -> (%1409:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=920)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=920), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=920), )] (%1409:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=920)]) -> (%1412:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=920)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=921), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=922), )] (%1410:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=921)]) -> (%1413:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=922)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=918), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=924), )] (%1411:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=918)]) -> (%1414:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=924)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=922), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=922), )] (%1413:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=922)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1415:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=922)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=924), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=924), )] (%1414:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=924)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=64)]) -> (%1416:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=924)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=924), outputs_0:QuantSpec(Raw(type: Float16), uuid=926), )] (%1416:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=924)]) -> (%1417:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=926)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=926), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=927), )] (%1417:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=926)]) -> (%1418:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=927)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=927), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=927), )] (%1418:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=927)]) -> (%1419:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=927)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=920), outputs_0:QuantSpec(Raw(type: Float16), uuid=928), )] (%1412:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=920)]) -> (%1420:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=928)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=928), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=929), )] (%1420:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=928)]) -> (%1421:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=929)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=927), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28), )] (%370:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %1419:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=927)]) -> (%1422:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=56), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=929), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56), )] (%371:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)], %1421:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=929)]) -> (%1423:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28), )] (%1422:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)]) -> (%1424:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56), )] (%1423:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) -> (%1425:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=922), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=930), )] (%1415:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=922)], %1424:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)]) -> (%1426:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=930)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=930), inputs_1:QuantSpec(Raw(type: Float32), uuid=931), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=930), )] (%1426:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=930)], %1427:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=931), constant:[0.088388346]]) -> (%1428:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=930)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=930), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=932), )] (%1428:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=930)]) -> (%1429:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=932)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=932), inputs_1:QuantSpec(Raw(type: Int16), uuid=933), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=932), )] (%1429:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=932)], %1430:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=933), constant:[-20]]) -> (%1431:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=932)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=934), outputs_0:QuantSpec(Raw(type: UInt8), uuid=935), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %1432:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=934), constant:[-0.9921875]]) -> (%1433:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=935)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=935), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=930), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=932), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=932), )] (%1433:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=935)], %1428:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=930)], %1431:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=932)]) -> (%1434:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=932)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=932), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=936), )] (%1434:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=932)]) -> (%1435:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=936)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=936), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=937), )] (%1435:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=936)], %1425:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) -> (%1436:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=937)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=937), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=937), )] (%1436:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=937)]) -> (%1437:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=937)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=937), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=937), )] (%1437:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=937)]) -> (%1437:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=937)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=937), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=939), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=938))] (%1437:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=937)]) -> (%1438:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=939)]) - cf.ReturnOp (%1438:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=939)], %1419:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=927)], %1421:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=929)]) -> () + (%9101:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=916)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, 
CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8065:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %8066:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) -> (%9133:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940)], %9114:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928)], %9116:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=930)]) { + linalg.CPU.LinearOp (%9101:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=916)]) -> (%9102:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=922)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=916), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=919), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=918))] (%9101:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=916)]) -> (%9103:tensor<[1, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=919)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=916), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=921), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=920))] (%9101:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=916)]) -> (%9104:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=921)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=922), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=922), )] (%9102:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=922)]) -> (%9102:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=922)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=922), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=922), )] (%9102:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=922)]) -> (%9105:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=922)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=919), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=919), )] (%9103:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=919)]) -> (%9103:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=919)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=919), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=919), )] (%9103:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=919)]) -> (%9106:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=919)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=921), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=921), )] (%9104:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=921)]) -> (%9104:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=921)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=921), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=921), )] (%9104:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=921)]) -> (%9107:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=921)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=922), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=923), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=924))] (%9105:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=922)]) -> (%9108:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=923)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=919), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=925), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=926))] (%9106:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=919)]) -> (%9109:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=925)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=923), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=923), )] (%9108:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=923)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%9110:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=923)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=925), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=925), )] (%9109:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=925)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%9111:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=925)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=925), outputs_0:QuantSpec(Raw(type: Float16), uuid=927), )] (%9111:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=925)]) -> (%9112:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=927)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=927), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928), )] (%9112:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=927)]) -> (%9113:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928), )] (%9113:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928)]) -> (%9114:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=928)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=921), outputs_0:QuantSpec(Raw(type: Float16), uuid=929), )] (%9107:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=921)]) -> (%9115:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=929)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=929), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=930), )] (%9115:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=929)]) -> (%9116:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=930)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28), )] (%8065:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %9114:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928)]) -> (%9117:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=28)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=930), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56), )] (%8066:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)], %9116:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=930)]) -> (%9118:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28), )] (%9117:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)]) -> (%9119:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56), )] (%9118:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) -> (%9120:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=923), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=931), )] (%9110:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=923)], %9119:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)]) -> (%9121:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=931)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=931), inputs_1:QuantSpec(Raw(type: Float32), uuid=932), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=931), )] (%9121:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=931)], %9122:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=932), constant:[0.088388346]]) -> (%9123:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=931)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=931), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=933), )] (%9123:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=931)]) -> (%9124:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=933)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=933), inputs_1:QuantSpec(Raw(type: Int16), uuid=934), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=933), )] (%9124:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=933)], %9125:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=934), constant:[-20]]) -> (%9126:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=933)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=935), outputs_0:QuantSpec(Raw(type: UInt8), uuid=936), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9127:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=935), constant:[-0.9921875]]) -> (%9128:tensor<[1, 1, 32, 1024], UInt8, 
CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=936)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=936), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=931), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=933), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=933), )] (%9128:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=936)], %9123:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=931)], %9126:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=933)]) -> (%9129:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=933)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=933), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=937), )] (%9129:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=933)]) -> (%9130:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=937)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=937), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=938), )] (%9130:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=937)], %9120:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) -> (%9131:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=938)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=938), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=938), )] (%9131:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=938)]) -> (%9132:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=938)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=938), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=938), )] (%9132:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=938)]) -> 
(%9132:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=938)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=938), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=939))] (%9132:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=938)]) -> (%9133:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940)]) + cf.ReturnOp (%9133:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940)], %9114:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928)], %9116:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=930)]) -> () } } graph.SubGraphOp @model.layers.25.mlp [using_qnn:true, symbol:model.layers.25.mlp] { - (%1440:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940)]) -> (%1445:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=948)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=943), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=942))] (%1440:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940)]) -> (%1441:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=943)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=943), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=944), )] (%1441:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=943)]) -> (%1442:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=944)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=946), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=945))] 
(%1440:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940)]) -> (%1443:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=946)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=944), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=946), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=944), )] (%1442:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=944)], %1443:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=946)]) -> (%1444:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=944)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=944), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=948), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=947))] (%1444:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=944)]) -> (%1445:tensor<[1, 
32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=948)]) - cf.ReturnOp (%1445:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=948)]) -> () + (%9135:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=941)]) -> (%9140:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=941), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=944), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=943))] (%9135:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=941)]) -> (%9136:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=944)]) + linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=944), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=945), )] (%9136:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=944)]) -> (%9137:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=945)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=941), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=947), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=946))] (%9135:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=941)]) -> (%9138:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=947)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=945), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=947), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=945), )] (%9137:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=945)], %9138:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=947)]) -> (%9139:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=945)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=945), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=948))] (%9139:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=945)]) -> (%9140:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)]) + cf.ReturnOp (%9140:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)]) -> () } } graph.SubGraphOp @model.layers.26 [using_qnn:true, symbol:model.layers.26] { - (%1446:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=948)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %372:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %373:tensor<[1, 8, 992, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) -> (%1487:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=982)], %1460:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=961)], %1462:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=963)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=948), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949), )] (%1446:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=948)]) -> (%1447:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)]) - graph.CallGraphOp @model.layers.26.self_attn (%1447:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], 
%372:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %373:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) -> (%1479:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=973)], %1460:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=961)], %1462:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=963)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=973), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=948), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=973), )] (%1479:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=973)], %1446:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=948)]) -> (%1480:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=973)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=973), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974), )] (%1480:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=973)]) -> (%1481:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974)]) - graph.CallGraphOp @model.layers.26.mlp (%1481:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974)]) -> (%1486:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=982)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=982), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=973), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=982), )] (%1486:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=982)], %1480:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=973)]) -> (%1487:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=982)]) - cf.ReturnOp (%1487:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=982)], %1460:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=961)], %1462:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=963)]) -> () + (%9141:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8067:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %8068:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) -> (%9182:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)], %9155:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962)], %9157:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=964)]) { + linalg.CPU.RMSNormOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=950), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=951))] (%9141:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)]) -> (%9142:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=950)]) + graph.CallGraphOp @model.layers.26.self_attn (%9142:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=950)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8067:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %8068:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) -> (%9174:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974)], %9155:tensor<[1, 8, 128, 
32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962)], %9157:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=964)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974), )] (%9174:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974)], %9141:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)]) -> (%9175:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=975), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=976))] (%9175:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974)]) -> (%9176:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=975)]) + graph.CallGraphOp @model.layers.26.mlp (%9176:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=975)]) -> (%9181:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983), )] (%9181:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)], %9175:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974)]) -> (%9182:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)]) + cf.ReturnOp (%9182:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)], %9155:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962)], %9157:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=964)]) -> () } } 
graph.SubGraphOp @model.layers.26.self_attn [using_qnn:true, symbol:model.layers.26.self_attn] { - (%1447:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %372:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %373:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) -> (%1479:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=973)], %1460:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=961)], %1462:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=963)]) { - linalg.CPU.LinearOp (%1447:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)]) -> (%1448:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=955)]) - linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=952), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=951))] (%1447:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)]) -> (%1449:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=952)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=954), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=953))] (%1447:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)]) -> (%1450:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=954)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=955), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=955), )] (%1448:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=955)]) -> (%1448:tensor<[1, 32, 16, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=955)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=955), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=955), )] (%1448:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=955)]) -> (%1451:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=955)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=952), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=952), )] (%1449:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=952)]) -> (%1449:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=952)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=952), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=952), )] (%1449:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=952)]) -> (%1452:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=952)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=954), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=954), )] (%1450:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=954)]) -> (%1450:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=954)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=954), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=954), )] (%1450:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=954)]) -> (%1453:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=954)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=955), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=956), )] (%1451:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=955)]) -> (%1454:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=956)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=952), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=958), )] (%1452:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=952)]) -> (%1455:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=958)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=956), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=956), )] (%1454:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=956)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1456:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=956)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=958), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=958), )] (%1455:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=958)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1457:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=958)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=958), outputs_0:QuantSpec(Raw(type: Float16), uuid=960), )] (%1457:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=958)]) -> (%1458:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=960)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=960), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=961), )] (%1458:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=960)]) -> (%1459:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=961)]) - linalg.CPU.TransposeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=961), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=961), )] (%1459:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=961)]) -> (%1460:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=961)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=954), outputs_0:QuantSpec(Raw(type: Float16), uuid=962), )] (%1453:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=954)]) -> (%1461:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=962)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=962), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=963), )] (%1461:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=962)]) -> (%1462:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=963)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=961), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 
127, quant_to_type: Int8, scale_type: Float32), uuid=29), )] (%372:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %1460:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=961)]) -> (%1463:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=963), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57), )] (%373:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)], %1462:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=963)]) -> (%1464:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29), )] (%1463:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)]) -> 
(%1465:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57), )] (%1464:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) -> (%1466:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=956), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=964), )] (%1456:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=956)], %1465:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)]) -> (%1467:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=964)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=964), inputs_1:QuantSpec(Raw(type: 
Float32), uuid=965), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=964), )] (%1467:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=964)], %1468:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=965), constant:[0.088388346]]) -> (%1469:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=964)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=964), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=966), )] (%1469:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=964)]) -> (%1470:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=966)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=966), inputs_1:QuantSpec(Raw(type: Int16), uuid=967), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=966), )] (%1470:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=966)], %1471:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=967), constant:[-20]]) -> (%1472:tensor<[1, 16, 32, 1], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=966)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=968), outputs_0:QuantSpec(Raw(type: UInt8), uuid=969), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %1473:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=968), constant:[0.27929688]]) -> (%1474:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=969)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=969), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=964), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=966), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=966), )] (%1474:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=969)], %1469:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=964)], %1472:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=966)]) -> (%1475:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=966)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=966), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=970), )] (%1475:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=966)]) -> (%1476:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=970)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=970), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=971), )] (%1476:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=970)], %1466:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) -> (%1477:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=971)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=971), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=971), )] (%1477:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=971)]) -> (%1478:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=971)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=971), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=971), )] (%1478:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=971)]) -> (%1478:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=971)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=971), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=973), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=972))] (%1478:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=971)]) -> (%1479:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=973)]) - cf.ReturnOp (%1479:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=973)], %1460:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=961)], %1462:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=963)]) -> () + (%9142:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=950)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8067:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %8068:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) -> (%9174:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974)], %9155:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962)], %9157:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=964)]) { + linalg.CPU.LinearOp (%9142:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=950)]) -> (%9143:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=956)]) + linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=950), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=953), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=952))] (%9142:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=950)]) -> (%9144:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=953)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=950), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=955), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=954))] (%9142:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=950)]) -> (%9145:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=955)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=956), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=956), )] (%9143:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=956)]) -> (%9143:tensor<[1, 32, 16, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=956)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=956), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=956), )] (%9143:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=956)]) -> (%9146:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=956)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=953), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=953), )] (%9144:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=953)]) -> (%9144:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=953)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=953), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=953), )] (%9144:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=953)]) -> (%9147:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=953)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=955), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=955), )] (%9145:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=955)]) -> (%9145:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=955)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=955), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=955), )] (%9145:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=955)]) -> (%9148:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=955)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=956), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=957), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=958))] (%9146:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=956)]) -> (%9149:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=957)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=953), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=959), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=960))] (%9147:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=953)]) -> (%9150:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=959)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=957), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=957), )] (%9149:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=957)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%9151:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=957)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=959), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=959), )] (%9150:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=959)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%9152:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=959)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=959), outputs_0:QuantSpec(Raw(type: Float16), uuid=961), )] (%9152:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=959)]) -> (%9153:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=961)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=961), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962), )] (%9153:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=961)]) -> (%9154:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962), )] (%9154:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962)]) -> (%9155:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=955), outputs_0:QuantSpec(Raw(type: Float16), uuid=963), )] (%9148:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=955)]) -> (%9156:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=963)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=963), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=964), )] (%9156:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=963)]) -> (%9157:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=964)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29), )] (%8067:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %9155:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962)]) -> (%9158:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=964), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57), )] (%8068:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)], %9157:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=964)]) -> (%9159:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29), )] 
(%9158:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)]) -> (%9160:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57), )] (%9159:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) -> (%9161:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=957), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=965), )] (%9151:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=957)], %9160:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)]) -> (%9162:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=965)]) + linalg.CPU.MulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=965), inputs_1:QuantSpec(Raw(type: Float32), uuid=966), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=965), )] (%9162:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=965)], %9163:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=966), constant:[0.088388346]]) -> (%9164:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=965)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=965), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=967), )] (%9164:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=965)]) -> (%9165:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=967)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=967), inputs_1:QuantSpec(Raw(type: Int16), uuid=968), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=967), )] (%9165:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=967)], %9166:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=968), constant:[-20]]) -> (%9167:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=967)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=969), outputs_0:QuantSpec(Raw(type: UInt8), uuid=970), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9168:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=969), constant:[0.27929688]]) -> (%9169:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=970)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=970), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=965), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=967), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=967), )] (%9169:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=970)], %9164:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=965)], %9167:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=967)]) -> (%9170:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=967)]) + linalg.CPU.SoftmaxOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=967), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=971), )] (%9170:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=967)]) -> (%9171:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=971)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=971), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=972), )] (%9171:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=971)], %9161:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) -> (%9172:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=972)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=972), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=972), )] (%9172:tensor<[1, 16, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=972)]) -> (%9173:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=972)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=972), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=972), )] (%9173:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=972)]) -> (%9173:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=972)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=972), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=973))] (%9173:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=972)]) -> (%9174:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974)]) + cf.ReturnOp (%9174:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=974)], %9155:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962)], %9157:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=964)]) -> () } } graph.SubGraphOp @model.layers.26.mlp [using_qnn:true, symbol:model.layers.26.mlp] { - (%1481:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974)]) -> (%1486:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=982)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=977), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=976))] (%1481:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974)]) -> (%1482:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=977)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=977), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=978), )] (%1482:tensor<[1, 32, 6144], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=977)]) -> (%1483:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=978)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=980), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=979))] (%1481:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974)]) -> (%1484:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=980)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=978), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=980), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=978), )] (%1483:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=978)], %1484:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=980)]) -> (%1485:tensor<[1, 32, 6144], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=978)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=978), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=982), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=981))] (%1485:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=978)]) -> (%1486:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=982)]) - cf.ReturnOp (%1486:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=982)]) -> () + (%9176:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=975)]) -> (%9181:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=975), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=978), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), 
uuid=977))] (%9176:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=975)]) -> (%9177:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=978)]) + linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=978), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=979), )] (%9177:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=978)]) -> (%9178:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=979)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=975), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=981), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=980))] (%9176:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=975)]) -> (%9179:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=981)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=979), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=981), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=979), )] (%9178:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=979)], %9179:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=981)]) -> (%9180:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=979)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=979), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=982))] (%9180:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=979)]) -> (%9181:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)]) + cf.ReturnOp (%9181:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)]) -> () } } graph.SubGraphOp @model.layers.27 [using_qnn:true, symbol:model.layers.27] { - (%1487:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=982)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %374:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %375:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%1528:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1016)], %1501:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=995)], %1503:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=997)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=982), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983), )] (%1487:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=982)]) -> (%1488:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)]) - graph.CallGraphOp @model.layers.27.self_attn (%1488:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %374:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %375:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%1520:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1007)], %1501:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=995)], %1503:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=997)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1007), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=982), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1007), )] (%1520:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1007)], %1487:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=982)]) -> (%1521:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1007)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1007), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008), )] (%1521:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1007)]) -> (%1522:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008)]) - graph.CallGraphOp @model.layers.27.mlp (%1522:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008)]) -> (%1527:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1016)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1016), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=1007), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1016), )] (%1527:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1016)], %1521:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1007)]) -> (%1528:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1016)]) - cf.ReturnOp (%1528:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1016)], %1501:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=995)], %1503:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=997)]) -> () + (%9182:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8069:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %8070:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%9223:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017)], %9196:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996)], %9198:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=998)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=984), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=985))] (%9182:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)]) -> (%9183:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=984)]) + graph.CallGraphOp @model.layers.27.self_attn (%9183:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=984)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8069:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %8070:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%9215:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008)], %9196:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996)], %9198:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=998)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008), )] (%9215:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008)], %9182:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)]) -> (%9216:tensor<[1, 32, 
2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1009), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1010))] (%9216:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008)]) -> (%9217:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1009)]) + graph.CallGraphOp @model.layers.27.mlp (%9217:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1009)]) -> (%9222:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017), )] (%9222:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017)], %9216:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008)]) -> (%9223:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017)]) + cf.ReturnOp (%9223:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017)], %9196:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996)], %9198:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=998)]) -> () } } graph.SubGraphOp @model.layers.27.self_attn [using_qnn:true, symbol:model.layers.27.self_attn] { - (%1488:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %374:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %375:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%1520:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1007)], %1501:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=995)], %1503:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=997)]) { - linalg.CPU.LinearOp (%1488:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)]) -> (%1489:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=989)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=986), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=985))] (%1488:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)]) -> (%1490:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=986)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=988), weight_weight:QuantSpec(LPBQ(quant_min: -8, 
quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=987))] (%1488:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)]) -> (%1491:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=988)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=989), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=989), )] (%1489:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=989)]) -> (%1489:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=989)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=989), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=989), )] (%1489:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=989)]) -> (%1492:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=989)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=986), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=986), )] (%1490:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=986)]) -> (%1490:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=986)]) - linalg.CPU.TransposeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=986), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=986), )] (%1490:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=986)]) -> (%1493:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=986)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=988), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=988), )] (%1491:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=988)]) -> (%1491:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=988)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=988), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=988), )] (%1491:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=988)]) -> (%1494:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=988)]) - 
linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=989), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=990), )] (%1492:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=989)]) -> (%1495:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=990)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=986), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=992), )] (%1493:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=986)]) -> (%1496:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=992)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=990), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=990), )] (%1495:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=990)], %379:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1497:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=990)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=992), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=992), )] (%1496:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=992)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1498:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=992)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=992), outputs_0:QuantSpec(Raw(type: Float16), uuid=994), )] 
(%1498:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=992)]) -> (%1499:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=994)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=994), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=995), )] (%1499:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=994)]) -> (%1500:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=995)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=995), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=995), )] (%1500:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=995)]) -> (%1501:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=995)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=988), outputs_0:QuantSpec(Raw(type: Float16), uuid=996), )] (%1494:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=988)]) -> (%1502:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=996)]) - linalg.CPU.CastTypeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=996), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=997), )] (%1502:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=996)]) -> (%1503:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=997)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=995), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30), )] (%374:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %1501:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=995)]) -> (%1504:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=997), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58), )] (%375:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=58)], %1503:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=997)]) -> (%1505:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30), )] (%1504:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)]) -> (%1506:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58), )] (%1505:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%1507:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=990), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=998), )] (%1497:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=990)], %1506:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)]) -> (%1508:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=998)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=998), inputs_1:QuantSpec(Raw(type: Float32), uuid=999), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=998), )] (%1508:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=998)], %1509:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=999), constant:[0.088388346]]) -> (%1510:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=998)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=998), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1000), )] (%1510:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=998)]) -> (%1511:tensor<[1, 16, 32, 1], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1000)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1000), inputs_1:QuantSpec(Raw(type: Int16), uuid=1001), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1000), )] (%1511:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1000)], %1512:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=1001), constant:[-20]]) -> (%1513:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1000)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=1002), outputs_0:QuantSpec(Raw(type: UInt8), uuid=1003), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %1514:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=1002), constant:[0.890625]]) -> (%1515:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=1003)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=1003), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=998), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1000), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1000), )] (%1515:tensor<[1, 1, 32, 1024], UInt8, 
CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=1003)], %1510:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=998)], %1513:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1000)]) -> (%1516:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1000)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1000), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1004), )] (%1516:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1000)]) -> (%1517:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1004)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1004), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1005), )] (%1517:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1004)], %1507:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%1518:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1005)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1005), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1005), )] (%1518:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1005)]) -> (%1519:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1005)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1005), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1005), )] (%1519:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1005)]) -> (%1519:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1005)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1005), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1007), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1006))] (%1519:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1005)]) -> (%1520:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1007)]) - cf.ReturnOp (%1520:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1007)], %1501:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=995)], %1503:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=997)]) -> () + (%9183:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=984)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8069:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %8070:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%9215:tensor<[1, 32, 
2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008)], %9196:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996)], %9198:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=998)]) { + linalg.CPU.LinearOp (%9183:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=984)]) -> (%9184:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=990)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=984), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=987), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=986))] (%9183:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=984)]) -> (%9185:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=987)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=984), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=989), weight_weight:QuantSpec(LPBQ(quant_min: 
-8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=988))] (%9183:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=984)]) -> (%9186:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=989)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=990), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=990), )] (%9184:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=990)]) -> (%9184:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=990)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=990), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=990), )] (%9184:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=990)]) -> (%9187:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=990)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=987), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=987), )] (%9185:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=987)]) -> (%9185:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=987)]) + linalg.CPU.TransposeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=987), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=987), )] (%9185:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=987)]) -> (%9188:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=987)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=989), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=989), )] (%9186:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=989)]) -> (%9186:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=989)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=989), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=989), )] (%9186:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=989)]) -> (%9189:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=989)]) + 
linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=990), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=991), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=992))] (%9187:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=990)]) -> (%9190:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=991)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=987), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=993), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=994))] (%9188:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=987)]) -> (%9191:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=993)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=991), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=991), )] (%9190:tensor<[1, 16, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=991)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%9192:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=991)]) + linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=993), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=993), )] (%9191:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=993)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%9193:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=993)]) + linalg.CPU.CastTypeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=993), outputs_0:QuantSpec(Raw(type: Float16), uuid=995), )] (%9193:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=993)]) -> (%9194:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=995)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=995), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996), )] (%9194:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=995)]) -> (%9195:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996), )] (%9195:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996)]) -> (%9196:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=989), outputs_0:QuantSpec(Raw(type: Float16), uuid=997), )] (%9189:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=989)]) -> (%9197:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=997)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=997), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=998), )] (%9197:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=997)]) -> (%9198:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=998)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30), )] (%8069:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %9196:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996)]) -> (%9199:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=998), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=58), )] (%8070:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)], %9198:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=998)]) -> (%9200:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30), )] (%9199:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)]) -> (%9201:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58), )] (%9200:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%9202:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=991), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=999), )] (%9192:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=991)], %9201:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)]) -> (%9203:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=999)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=999), inputs_1:QuantSpec(Raw(type: Float32), uuid=1000), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=999), )] (%9203:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=999)], %9204:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=1000), constant:[0.088388346]]) -> (%9205:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=999)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=999), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1001), )] (%9205:tensor<[1, 16, 32, 1024], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=999)]) -> (%9206:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1001)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1001), inputs_1:QuantSpec(Raw(type: Int16), uuid=1002), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1001), )] (%9206:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1001)], %9207:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=1002), constant:[-20]]) -> (%9208:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1001)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=1003), outputs_0:QuantSpec(Raw(type: UInt8), uuid=1004), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9209:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=1003), constant:[0.890625]]) -> (%9210:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=1004)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=1004), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=999), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=1001), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1001), )] (%9210:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=1004)], %9205:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=999)], %9208:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1001)]) -> (%9211:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1001)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1001), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1005), )] (%9211:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1001)]) -> (%9212:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1005)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1005), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1006), )] (%9212:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1005)], %9202:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%9213:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1006)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1006), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1006), )] (%9213:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1006)]) -> (%9214:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1006)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1006), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1006), )] (%9214:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1006)]) -> (%9214:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1006)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1006), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1007))] (%9214:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1006)]) -> (%9215:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008)]) + cf.ReturnOp (%9215:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008)], %9196:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996)], %9198:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=998)]) -> () } } graph.SubGraphOp @model.layers.27.mlp [using_qnn:true, symbol:model.layers.27.mlp] { - (%1522:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008)]) -> (%1527:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1016)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1011), 
weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1010))] (%1522:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008)]) -> (%1523:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1011)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1011), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1012), )] (%1523:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1011)]) -> (%1524:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1012)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1014), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1013))] (%1522:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008)]) -> (%1525:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=1014)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1012), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1014), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1012), )] (%1524:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1012)], %1525:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1014)]) -> (%1526:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1012)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1012), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1016), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1015))] (%1526:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1012)]) -> (%1527:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1016)]) - cf.ReturnOp (%1527:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=1016)]) -> () + (%9217:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1009)]) -> (%9222:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1009), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1012), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1011))] (%9217:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1009)]) -> (%9218:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1012)]) + linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1012), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1013), )] (%9218:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1012)]) -> (%9219:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1013)]) + linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1009), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1015), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1014))] (%9217:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1009)]) -> (%9220:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1015)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1013), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1015), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1013), )] (%9219:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1013)], %9220:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1015)]) -> (%9221:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1013)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1013), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1016))] (%9221:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1013)]) -> (%9222:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017)]) + cf.ReturnOp (%9222:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017)]) -> () } } // ╔═════╗ diff --git a/mllm/backends/cpu/CPUBackend.cpp b/mllm/backends/cpu/CPUBackend.cpp index a63fcd366..f8a3d8d1c 100644 --- a/mllm/backends/cpu/CPUBackend.cpp +++ b/mllm/backends/cpu/CPUBackend.cpp @@ -50,6 +50,7 @@ #include "mllm/backends/cpu/ops/STFTOp.hpp" #include "mllm/backends/cpu/ops/Scatter2ShardsOp.hpp" #include "mllm/backends/cpu/ops/SiLUOp.hpp" +#include "mllm/backends/cpu/ops/SigmoidOp.hpp" #include "mllm/backends/cpu/ops/SliceOp.hpp" #include "mllm/backends/cpu/ops/SoftmaxOp.hpp" #include "mllm/backends/cpu/ops/SplitOp.hpp" @@ -73,11 +74,11 @@ CPUBackend::CPUBackend() : Backend(kCPU, createCPUAllocator()) { CPUReduceSumOpFactory, CPUTransposeOpFactory, CPUPermuteOpFactory, CPUCastTypeOpFactory, CPUConcatOpFactory, CPUStackOpFactory, CPUContiguousOpFactory, CPUCopyOpFactory, CPUEmbeddingOpFactory, CPUSplitOpFactory, CPUViewOpFactory, CPULayerNormOpFactory, CPURepeatOpFactory, CPUX2XOpFactory, CPUSoftmaxOpFactory, - CPUSiLUOpFactory, CPURMSNormOpFactory, CPUGELUOpFactory, CPUQuickGELUOpFactory, CPUReLUOpFactory, - CPUMatMulOpFactory, CPUFlashAttention2OpFactory, CPUSliceOpFactory, 
CPUVisionRoPEOpFactory, CPUParamOpFactory, - CPUMultimodalRoPEOpFactory, CPURoPEOpFactory, CPUCausalMaskOpFactory, CPUConv1DOpFactory, CPUConv3DOpFactory, - CPUSTFTOpFactory, CPUISTFTOpFactory, CPUIndexOpFactory, CPUTopKOpFactory, CPUClipOpFactory, CPUMeanOpFactory, - CPUKVCacheOpFactory, CPUPagedAttnOpFactory, CPUScatter2ShardsOpFactory, CPURadixAttnOpFactory, + CPUSiLUOpFactory, CPUSigmoidOpFactory, CPURMSNormOpFactory, CPUGELUOpFactory, CPUQuickGELUOpFactory, + CPUReLUOpFactory, CPUMatMulOpFactory, CPUFlashAttention2OpFactory, CPUSliceOpFactory, CPUVisionRoPEOpFactory, + CPUParamOpFactory, CPUMultimodalRoPEOpFactory, CPURoPEOpFactory, CPUCausalMaskOpFactory, CPUConv1DOpFactory, + CPUConv3DOpFactory, CPUSTFTOpFactory, CPUISTFTOpFactory, CPUIndexOpFactory, CPUTopKOpFactory, CPUClipOpFactory, + CPUMeanOpFactory, CPUKVCacheOpFactory, CPUPagedAttnOpFactory, CPUScatter2ShardsOpFactory, CPURadixAttnOpFactory, CPUConv2DOpFactory, CPULayerNorm2DOpFactory, CPUInterpolateOpFactory, CPUPadOpFactory, CPUMaskedScatterOpFactory, CPUArgsortOpFactory, CPUCloneOpFactory, CPUAvgPool1dOpFactory, CPUFlashAttention2SwaSinkOpFactory, CPURadixAttnRelaxOpFactory, CPURadixAttnSwaSinkOpFactory, CPUEqualOpFactory, CPUWhereOpFactory>(); diff --git a/mllm/backends/cpu/kernels/Kernels.hpp b/mllm/backends/cpu/kernels/Kernels.hpp index 3d3ee9c8e..e8c05dfac 100644 --- a/mllm/backends/cpu/kernels/Kernels.hpp +++ b/mllm/backends/cpu/kernels/Kernels.hpp @@ -10,6 +10,7 @@ #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) #include "mllm/backends/cpu/kernels/x86/fill.hpp" // IWYU pragma: export #include "mllm/backends/cpu/kernels/x86/silu.hpp" // IWYU pragma: export +#include "mllm/backends/cpu/kernels/x86/sigmoid.hpp" // IWYU pragma: export #include "mllm/backends/cpu/kernels/x86/softmax.hpp" // IWYU pragma: export #include "mllm/backends/cpu/kernels/x86/rmsnorm.hpp" // IWYU pragma: export #include "mllm/backends/cpu/kernels/x86/gelu.hpp" // IWYU pragma: export @@ -22,6 +23,7 @@ 
#include "mllm/backends/cpu/kernels/arm/transpose.hpp" // IWYU pragma: export #include "mllm/backends/cpu/kernels/arm/permute.hpp" // IWYU pragma: export #include "mllm/backends/cpu/kernels/arm/silu.hpp" // IWYU pragma: export +#include "mllm/backends/cpu/kernels/arm/sigmoid.hpp" // IWYU pragma: export #include "mllm/backends/cpu/kernels/arm/cast_types.hpp" // IWYU pragma: export #include "mllm/backends/cpu/kernels/arm/layernorm.hpp" // IWYU pragma: export #include "mllm/backends/cpu/kernels/arm/softmax.hpp" // IWYU pragma: export diff --git a/mllm/backends/cpu/kernels/arm/sigmoid.cpp b/mllm/backends/cpu/kernels/arm/sigmoid.cpp new file mode 100644 index 000000000..8f18f5df8 --- /dev/null +++ b/mllm/backends/cpu/kernels/arm/sigmoid.cpp @@ -0,0 +1,131 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#include "mllm/backends/cpu/kernels/arm/sigmoid.hpp" +#include "mllm/core/Parallel.hpp" + +#if defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) + +#include "mllm/backends/cpu/kernels/arm/math.hpp" + +namespace mllm::cpu::arm { + +void sigmoid_fp32(const mllm_fp32_t* __restrict X, mllm_fp32_t* __restrict Y, int len, int thread_count) { + if (thread_count > 1) { + int tails = len % 16; + int _16_loops = len < 16 ? 
0 : len - tails; + MLLM_AUTO_PARALLEL_FOR_BEGIN_NT(i, 0, _16_loops, 16, thread_count) { + float32x4_t x_line_0 = vld1q_f32(X + i); + float32x4_t ans_line_0 = vsigmoid_f32(x_line_0); + vst1q_f32(Y + i, ans_line_0); + + float32x4_t x_line_1 = vld1q_f32(X + i + 4); + float32x4_t ans_line_1 = vsigmoid_f32(x_line_1); + vst1q_f32(Y + i + 4, ans_line_1); + + float32x4_t x_line_2 = vld1q_f32(X + i + 8); + float32x4_t ans_line_2 = vsigmoid_f32(x_line_2); + vst1q_f32(Y + i + 8, ans_line_2); + + float32x4_t x_line_3 = vld1q_f32(X + i + 12); + float32x4_t ans_line_3 = vsigmoid_f32(x_line_3); + vst1q_f32(Y + i + 12, ans_line_3); + } + MLLM_AUTO_PARALLEL_FOR_END_NT() + int i = _16_loops; + for (; i <= len - 8; i += 8) { + float32x4_t x_line_0 = vld1q_f32(X + i); + float32x4_t ans_line_0 = vsigmoid_f32(x_line_0); + vst1q_f32(Y + i, ans_line_0); + + float32x4_t x_line_1 = vld1q_f32(X + i + 4); + float32x4_t ans_line_1 = vsigmoid_f32(x_line_1); + vst1q_f32(Y + i + 4, ans_line_1); + } + for (; i <= len - 4; i += 4) { + float32x4_t x_line_0 = vld1q_f32(X + i); + float32x4_t ans_line_0 = vsigmoid_f32(x_line_0); + vst1q_f32(Y + i, ans_line_0); + } + for (; i < len; i++) { Y[i] = 1.0f / (1.0f + std::exp(-X[i])); } + } else { + int i; + for (i = 0; i <= len - 16; i += 16) { + float32x4_t x_line_0 = vld1q_f32(X + i); + float32x4_t ans_line_0 = vsigmoid_f32(x_line_0); + vst1q_f32(Y + i, ans_line_0); + + float32x4_t x_line_1 = vld1q_f32(X + i + 4); + float32x4_t ans_line_1 = vsigmoid_f32(x_line_1); + vst1q_f32(Y + i + 4, ans_line_1); + + float32x4_t x_line_2 = vld1q_f32(X + i + 8); + float32x4_t ans_line_2 = vsigmoid_f32(x_line_2); + vst1q_f32(Y + i + 8, ans_line_2); + + float32x4_t x_line_3 = vld1q_f32(X + i + 12); + float32x4_t ans_line_3 = vsigmoid_f32(x_line_3); + vst1q_f32(Y + i + 12, ans_line_3); + } + for (; i <= len - 8; i += 8) { + float32x4_t x_line_0 = vld1q_f32(X + i); + float32x4_t ans_line_0 = vsigmoid_f32(x_line_0); + vst1q_f32(Y + i, ans_line_0); + + float32x4_t x_line_1 = 
vld1q_f32(X + i + 4); + float32x4_t ans_line_1 = vsigmoid_f32(x_line_1); + vst1q_f32(Y + i + 4, ans_line_1); + } + for (; i <= len - 4; i += 4) { + float32x4_t x_line_0 = vld1q_f32(X + i); + float32x4_t ans_line_0 = vsigmoid_f32(x_line_0); + vst1q_f32(Y + i, ans_line_0); + } + for (; i < len; i++) { Y[i] = 1.0f / (1.0f + std::exp(-X[i])); } + } +} + +void sigmoid_fp16(const mllm_fp16_t* __restrict X, mllm_fp16_t* __restrict Y, int len, int thread_count) { + if (thread_count > 1) { + int tails = len % 16; + int _16_loops = len < 16 ? 0 : len - tails; + MLLM_AUTO_PARALLEL_FOR_BEGIN_NT(i, 0, _16_loops, 16, thread_count) { + float16x8_t x_line_0 = vld1q_f16(X + i); + float16x8_t ans_line_0 = vsigmoid_f16(x_line_0); + vst1q_f16(Y + i, ans_line_0); + + float16x8_t x_line_1 = vld1q_f16(X + i + 8); + float16x8_t ans_line_1 = vsigmoid_f16(x_line_1); + vst1q_f16(Y + i + 8, ans_line_1); + } + MLLM_AUTO_PARALLEL_FOR_END_NT() + int i = _16_loops; + for (; i <= len - 8; i += 8) { + float16x8_t x_line_0 = vld1q_f16(X + i); + float16x8_t ans_line_0 = vsigmoid_f16(x_line_0); + vst1q_f16(Y + i, ans_line_0); + } + for (; i < len; i++) { Y[i] = 1.0f / (1.0f + std::exp(-static_cast(X[i]))); } + } else { + int i; + for (i = 0; i <= len - 16; i += 16) { + float16x8_t x_line_0 = vld1q_f16(X + i); + float16x8_t ans_line_0 = vsigmoid_f16(x_line_0); + vst1q_f16(Y + i, ans_line_0); + + float16x8_t x_line_1 = vld1q_f16(X + i + 8); + float16x8_t ans_line_1 = vsigmoid_f16(x_line_1); + vst1q_f16(Y + i + 8, ans_line_1); + } + for (; i <= len - 8; i += 8) { + float16x8_t x_line_0 = vld1q_f16(X + i); + float16x8_t ans_line_0 = vsigmoid_f16(x_line_0); + vst1q_f16(Y + i, ans_line_0); + } + for (; i < len; i++) { Y[i] = 1.0f / (1.0f + std::exp(-static_cast(X[i]))); } + } +} + +} // namespace mllm::cpu::arm + +#endif diff --git a/mllm/backends/cpu/kernels/arm/sigmoid.hpp b/mllm/backends/cpu/kernels/arm/sigmoid.hpp new file mode 100644 index 000000000..70ca78d4b --- /dev/null +++ 
b/mllm/backends/cpu/kernels/arm/sigmoid.hpp @@ -0,0 +1,18 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. +#pragma once + +#include "mllm/core/DataTypes.hpp" +#include "mllm/utils/CPUArchHelper.hpp" + +#if defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) + +namespace mllm::cpu::arm { + +void sigmoid_fp32(const mllm_fp32_t* __restrict X, mllm_fp32_t* __restrict Y, int len, int thread_count); + +void sigmoid_fp16(const mllm_fp16_t* __restrict X, mllm_fp16_t* __restrict Y, int len, int thread_count); + +} // namespace mllm::cpu::arm + +#endif diff --git a/mllm/backends/cpu/kernels/x86/sigmoid.cpp b/mllm/backends/cpu/kernels/x86/sigmoid.cpp new file mode 100644 index 000000000..53a5fe84d --- /dev/null +++ b/mllm/backends/cpu/kernels/x86/sigmoid.cpp @@ -0,0 +1,47 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#include "mllm/backends/cpu/kernels/x86/sigmoid.hpp" +#include "mllm/core/Parallel.hpp" + +#if defined(MLLM_HOST_ARCH_X86) || defined(MLLM_HOST_ARCH_X86_64) + +#include "mllm/backends/cpu/kernels/common/sigmoid-inl.hpp" +#include + +namespace mllm::cpu::x86 { + +namespace hn = hwy::HWY_NAMESPACE; + +void sigmoid_fp32(const mllm_fp32_t* __restrict X, mllm_fp32_t* __restrict Y, int len, int thread_count) { + using D = hn::ScalableTag; + const D d; + const auto vector_size = hn::Lanes(d); + const int aligned_len = len - (len % vector_size); + + if (thread_count > 1) { + MLLM_AUTO_PARALLEL_FOR_BEGIN_NT(i, 0, aligned_len, vector_size, thread_count) { + auto x = hn::LoadU(d, X + i); + auto result = mllm::cpu::common::HWY_NAMESPACE::__sigmoid_fp32_vector(d, x); + hn::StoreU(result, d, Y + i); + } + MLLM_AUTO_PARALLEL_FOR_END_NT() + + // Handle remaining elements + for (int i = aligned_len; i < len; ++i) { Y[i] = 1.0f / (1.0f + std::exp(-X[i])); } + } else { + int i = 0; + for (; i + vector_size <= len; i += vector_size) { + auto x = hn::LoadU(d, X + i); + auto result = 
mllm::cpu::common::HWY_NAMESPACE::__sigmoid_fp32_vector(d, x); + hn::StoreU(result, d, Y + i); + } + + // Handle remaining elements + for (; i < len; ++i) { Y[i] = 1.0f / (1.0f + std::exp(-X[i])); } + } +} + +} // namespace mllm::cpu::x86 + +#endif diff --git a/mllm/backends/cpu/kernels/x86/sigmoid.hpp b/mllm/backends/cpu/kernels/x86/sigmoid.hpp new file mode 100644 index 000000000..96b719e8e --- /dev/null +++ b/mllm/backends/cpu/kernels/x86/sigmoid.hpp @@ -0,0 +1,16 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. +#pragma once + +#include "mllm/core/DataTypes.hpp" +#include "mllm/utils/CPUArchHelper.hpp" + +#if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) + +namespace mllm::cpu::x86 { + +void sigmoid_fp32(const mllm_fp32_t* __restrict X, mllm_fp32_t* __restrict Y, int len, int thread_count); + +} // namespace mllm::cpu::x86 + +#endif diff --git a/mllm/backends/cpu/ops/LinearOp.cpp b/mllm/backends/cpu/ops/LinearOp.cpp index 62f2392ca..f3c7bfa64 100644 --- a/mllm/backends/cpu/ops/LinearOp.cpp +++ b/mllm/backends/cpu/ops/LinearOp.cpp @@ -360,7 +360,7 @@ void CPULinearOp::reshape(const std::vector& inputs, std::vector } case aops::LinearImplTypes::kQNN_LPBQ_w4a16o16_G32: case aops::LinearImplTypes::kQNN_LPBQ_w4a16o16_G64: { - o_dtype = kInt16PerTensorSym; + o_dtype = kUInt16PerTensorAsy; break; } default: o_dtype = i.dtype(); diff --git a/mllm/backends/cpu/ops/SigmoidOp.cpp b/mllm/backends/cpu/ops/SigmoidOp.cpp new file mode 100644 index 000000000..9bd8880f7 --- /dev/null +++ b/mllm/backends/cpu/ops/SigmoidOp.cpp @@ -0,0 +1,41 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#include +#include "mllm/backends/cpu/ops/SigmoidOp.hpp" +#include "mllm/backends/cpu/kernels/Kernels.hpp" + +namespace mllm::cpu { + +CPUSigmoidOp::CPUSigmoidOp(const aops::SigmoidOpOptions& options) : aops::SigmoidOp(options) {} + +void CPUSigmoidOp::forward(const std::vector& inputs, std::vector& outputs) { + const auto& X = inputs[0]; + auto& Y = outputs[0]; + + switch (X.dtype()) { + case kFloat32: { +#if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) + x86::sigmoid_fp32(X.ptr(), Y.ptr(), X.numel(), options_.getThreads()); +#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) + arm::sigmoid_fp32(X.ptr(), Y.ptr(), X.numel(), options_.getThreads()); +#else + NYI("Sigmoid not supported for Other Architectures"); +#endif + break; + } + case kFloat16: { +#if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) + NYI("Sigmoid FP16 not implemented yet for X86"); +#elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) + arm::sigmoid_fp16(X.ptr(), Y.ptr(), X.numel(), options_.getThreads()); +#else + NYI("Sigmoid not supported for Other Architectures"); +#endif + break; + } + default: NYI("CPUSigmoidOp::forward not support dtype {}", nameOfType(X.dtype())); break; + } +} + +} // namespace mllm::cpu diff --git a/mllm/backends/cpu/ops/SigmoidOp.hpp b/mllm/backends/cpu/ops/SigmoidOp.hpp new file mode 100644 index 000000000..507cc44e5 --- /dev/null +++ b/mllm/backends/cpu/ops/SigmoidOp.hpp @@ -0,0 +1,25 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#pragma once + +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/aops/SigmoidOp.hpp" + +namespace mllm::cpu { + +class CPUSigmoidOp final : public aops::SigmoidOp { + public: + explicit CPUSigmoidOp(const aops::SigmoidOpOptions& options); + + void forward(const std::vector& inputs, std::vector& outputs) override; +}; + +class CPUSigmoidOpFactory : public TypedOpFactory { + public: + std::shared_ptr createOpImpl(const aops::SigmoidOpOptions& options) override { + return std::make_shared(options); + } +}; + +} // namespace mllm::cpu diff --git a/mllm/backends/qnn/aot/passes/AOTCompileContext.cpp b/mllm/backends/qnn/aot/passes/AOTCompileContext.cpp index 43d8801d6..05fe55560 100644 --- a/mllm/backends/qnn/aot/passes/AOTCompileContext.cpp +++ b/mllm/backends/qnn/aot/passes/AOTCompileContext.cpp @@ -21,4 +21,8 @@ void AOTCompileContext::setConfig(const std::string& fp) { nlohmann::json& AOTCompileContext::getConfig() { return config_; } +void AOTCompileContext::setParamFile(const ParameterFile::ptr_t& params) { params_ = params; } + +ParameterFile::ptr_t AOTCompileContext::getParamFile() { return params_; } + } // namespace mllm::qnn::aot diff --git a/mllm/backends/qnn/aot/passes/AOTCompileContext.hpp b/mllm/backends/qnn/aot/passes/AOTCompileContext.hpp index a9def31ef..9fdcff1c4 100644 --- a/mllm/backends/qnn/aot/passes/AOTCompileContext.hpp +++ b/mllm/backends/qnn/aot/passes/AOTCompileContext.hpp @@ -5,6 +5,7 @@ #include #include "mllm/backends/qnn/aot/QnnWrappersAPI.hpp" +#include "mllm/core/ParameterFile.hpp" namespace mllm::qnn::aot { @@ -29,12 +30,17 @@ class AOTCompileContext { nlohmann::json& getConfig(); + void setParamFile(const ParameterFile::ptr_t& params); + + ParameterFile::ptr_t getParamFile(); + private: // Private constructor AOTCompileContext() = default; QnnAOTEnv* env_ = nullptr; nlohmann::json config_; + ParameterFile::ptr_t params_; }; } // namespace mllm::qnn::aot diff --git a/mllm/backends/qnn/aot/passes/AOTPipeline.cpp 
b/mllm/backends/qnn/aot/passes/AOTPipeline.cpp index b1caa2d13..c60c6aa78 100644 --- a/mllm/backends/qnn/aot/passes/AOTPipeline.cpp +++ b/mllm/backends/qnn/aot/passes/AOTPipeline.cpp @@ -6,6 +6,7 @@ #include "mllm/backends/qnn/aot/passes/MarkTensorIO.hpp" #include "mllm/backends/qnn/aot/passes/MergeLLMHeadIntoMainGraphPass.hpp" #include "mllm/backends/qnn/aot/passes/OpNamingPass.hpp" +#include "mllm/backends/qnn/aot/passes/PTQPass.hpp" #include "mllm/backends/qnn/aot/passes/SplitLLMGraphPass.hpp" namespace mllm::qnn::aot { @@ -22,9 +23,10 @@ std::vector> createQnnAOTLoweringPipeline(QnnAOTEnv* e ret.emplace_back(createOpNamingPass()); ret.emplace_back(createMergeLLMHeadIntoMainGraphPass()); ret.emplace_back(createLLMQuantRecipePass()); - ret.emplace_back(createSplitLLMGraphPass()); - ret.emplace_back(createMarkTensorIOPass()); - ret.emplace_back(createLLM2QnnLoweringPass()); + ret.emplace_back(createPTQPass()); + // ret.emplace_back(createSplitLLMGraphPass()); + // ret.emplace_back(createMarkTensorIOPass()); + // ret.emplace_back(createLLM2QnnLoweringPass()); } else { MLLM_WARN("This pass currently only supports LLM applications. 
Please ensure your config contains 'quant_recipe.llm_recipe " "= true'."); diff --git a/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp b/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp index e6a16b824..adada76ed 100644 --- a/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp +++ b/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp @@ -38,6 +38,13 @@ void recursiveVisitGraph(const ir::IRContext::ptr_t& ctx, if (!some_op->getAttr("quant_recipe")) { for (auto& pattern : patterns_w_priority_) { if (pattern.second->isMatch(some_op)) { + for (auto& _named_pattern_ : _named_pattern) { + if (_named_pattern_.second == pattern.second) { + MLLM_INFO("LLMQuantizationRecipePass Processing op: {} with pass: {}", + some_op->cast_()->getAOp()->getName(), _named_pattern_.first); + } + } + if (!pattern.second->rewrite(iw, some_op)) { for (auto& _named_pattern_ : _named_pattern) { if (_named_pattern_.second == pattern.second) { @@ -87,6 +94,11 @@ ir::linalg::LinalgIRQuantizatonSpecAttr::ptr_t genSimpleQuantizationSpecAttr(con spec = ir::linalg::QuantizationSpecSymPerTensor::create(0, 65535, kUInt16, kFloat32, Tensor::nil()); break; } + case kUInt16PerTensorAsy: { + spec = + ir::linalg::QuantizationSpecAsymPerTensor::create(0, 65535, kUInt16, kFloat32, kInt32, Tensor::nil(), Tensor::nil()); + break; + } case kUInt8: case kUInt16: case kUInt32: @@ -241,6 +253,31 @@ ir::linalg::LinalgIRQuantizatonSpecAttr::ptr_t cloneQuantizationSpecType( return ctx->create(cloned_spec); } +//===----------------------------------------------------------------------===// +// Sigmoid Pattern +//===----------------------------------------------------------------------===// +bool LLMQuantRecipeSigmoidPattern::isMatch(const mllm::ir::op_ptr_t& op) { + if (op->isa_()) { return true; } + return false; +} + +bool LLMQuantRecipeSigmoidPattern::rewrite(ir::IRWriter& writer, const ir::op_ptr_t& node) { + return noSharingSingleInAndSingleOutQuantAnnoAttr(writer.getContext(), node->cast_()); +} + 
+//===----------------------------------------------------------------------===// +// Negative Pattern +//===----------------------------------------------------------------------===// +bool LLMQuantRecipeNegPattern::isMatch(const mllm::ir::op_ptr_t& op) { + if (op->isa_()) { return true; } + return false; +} + +bool LLMQuantRecipeNegPattern::rewrite(ir::IRWriter& writer, const ir::op_ptr_t& node) { + return shareQuantSpecSingleInputToSingleOutputAndSetOpQuantAnnoAttr(writer.getContext(), + node->cast_()); +} + //===----------------------------------------------------------------------===// // ReduceMin Pattern //===----------------------------------------------------------------------===// @@ -352,7 +389,12 @@ bool LLMQuantRecipeRMSNormPattern::rewrite(ir::IRWriter& writer, const ir::op_pt auto weight_spec_attr = cloneQuantizationSpecType( writer.getContext(), node->inputs().front()->getAttr("quant_recipe")->cast_()); - weight_reg_tensor_ir->outputs().front()->setAttr("qnn_recipe", weight_spec_attr); + weight_reg_tensor_ir->outputs().front()->setAttr("quant_recipe", weight_spec_attr); + + // Get self anno + node->getAttr("quant_recipe") + ->cast_() + ->annotation_.weights.insert({"weight", weight_spec_attr->spec_}); return true; } @@ -401,6 +443,19 @@ bool LLMQuantRecipeIndexPattern::rewrite(ir::IRWriter& writer, const ir::op_ptr_ return true; } +//===----------------------------------------------------------------------===// +// Slice Pattern +//===----------------------------------------------------------------------===// +bool LLMQuantRecipeSlicePattern::isMatch(const mllm::ir::op_ptr_t& op) { + if (op->isa_()) { return true; } + return false; +} + +bool LLMQuantRecipeSlicePattern::rewrite(ir::IRWriter& writer, const ir::op_ptr_t& node) { + return shareQuantSpecSingleInputToSingleOutputAndSetOpQuantAnnoAttr(writer.getContext(), + node->cast_()); +} + //===----------------------------------------------------------------------===// // Elementwise Pattern 
//===----------------------------------------------------------------------===// @@ -422,23 +477,9 @@ bool LLMQuantRecipeElementwisePattern::rewrite(ir::IRWriter& writer, const ir::o // i_1 maybe a constant, we need to create quant recipe for it if (!i_1->getAttr("quant_recipe")) { if (i_1->getAttr("constant")) { - auto i_1_tensor = i_1->cast_()->tensor_; - switch (i_1_tensor.dtype()) { - case kUInt16: - case kUInt8: - case kInt16: - case kInt8: - case kFloat32: - case kFloat16: - case kBFloat16: { - i_1->setAttr("quant_recipe", writer.create( - ir::linalg::QuantizationSpecRaw::create(i_1_tensor.dtype()))); - break; - } - default: { - NYI("Only support [int16, int8, bf16, f16, sf32] for now."); - } - } + i_1->setAttr("quant_recipe", + cloneQuantizationSpecType(writer.getContext(), + i_0->getAttr("quant_recipe")->cast_())); } else { MLLM_WARN("LLMQuantRecipeEqualPattern Only support constant Value as second inputs right now. Pls send us a issue or PR " @@ -447,8 +488,6 @@ bool LLMQuantRecipeElementwisePattern::rewrite(ir::IRWriter& writer, const ir::o } } - MLLM_RETURN_FALSE_IF_NOT(i_1->getAttr("quant_recipe")); - o_0->setAttr("quant_recipe", i_0->getAttr("quant_recipe")); auto annotation_attr = writer.create(); @@ -738,7 +777,8 @@ bool LLMQuantRecipeLinearPattern::rewrite(ir::IRWriter& writer, const ir::op_ptr ir::linalg::QuantizationSpecLPBQ::create(-8, 7, block_size, -1, 4, kUInt4, kFloat32, Tensor::nil(), Tensor::nil()); // output sym int16 - auto out_quant_spec = ir::linalg::QuantizationSpecSymPerTensor::create(-32768, 32767, kInt16, kFloat32, Tensor::nil()); + auto out_quant_spec = ir::linalg::QuantizationSpecAsymPerTensor::create(0, 65536, kUInt16, kFloat32, kInt32, + Tensor::nil(), Tensor::nil()); linear_ir->outputs().front()->setAttr("quant_recipe", writer.create(out_quant_spec)); @@ -857,14 +897,32 @@ bool LLMQuantRecipeEmbeddingPattern::rewrite(ir::IRWriter& writer, const ir::op_ o_quant_spec = ir::linalg::QuantizationSpecSymPerTensor::create(0, 65535, 
kUInt16, kFloat32, Tensor::nil()); break; } + case kUInt16PerTensorAsy: { + o_quant_spec = ir::linalg::QuantizationSpecAsymPerTensor::create(0, 65535, kUInt16, kFloat32, kInt32, Tensor::nil(), + Tensor::nil()); + break; + } default: { NYI("Only support [uint16, int16, uint8, int8], [sym] for now."); } } + // Weights + auto weight_name = embedding_op->getAOp()->getName() + ".weight"; + auto weight_reg_tensor_ir = writer.getContext()->lookupSymbolTable(weight_name); + MLLM_RETURN_FALSE_IF_NOT(weight_reg_tensor_ir); + MLLM_RETURN_FALSE_IF_NOT(weight_reg_tensor_ir->isa_()); + MLLM_RETURN_FALSE_IF_NOT(weight_reg_tensor_ir->outputs().front()->isa_()); + auto weight_tensor = weight_reg_tensor_ir->outputs().front()->cast_(); + annotation_attr->annotation_.outputs.emplace_back(o_quant_spec); quantize_op->outputs().front()->setAttr("quant_recipe", writer.create(o_quant_spec)); + + // Embedding weight quantization method same as outputs, but not share, just same type + auto weight_spec_attr = genSimpleQuantizationSpecAttr(writer.getContext(), weight_tensor); + weight_reg_tensor_ir->outputs().front()->setAttr("quant_recipe", weight_spec_attr); + annotation_attr->annotation_.weights.insert({"weight", weight_spec_attr->spec_}); } // Attach to quantize node @@ -941,6 +999,9 @@ bool LLMQuantRecipeQwen3AttentionPattern::rewrite(ir::IRWriter& writer, const ir LLMQuantRecipePass::LLMQuantRecipePass() { auto config = AOTCompileContext::getInstance().getConfig(); // Register all patterns + addPattern(LLMQuantRecipeNegPattern::create(), "neg", 0); + addPattern(LLMQuantRecipeSlicePattern::create(), "slice", 0); + addPattern(LLMQuantRecipeSigmoidPattern::create(), "sigmoid", 0); addPattern(LLMQuantRecipeReduceMinPattern::create(), "reduce_min", 0); addPattern(LLMQuantRecipeRoPEPattern::create(), "rope", 0); addPattern(LLMQuantRecipeCastTypePattern::create(), "cast_type", 0); @@ -958,9 +1019,6 @@ LLMQuantRecipePass::LLMQuantRecipePass() { addPattern(LLMQuantRecipeLinearPattern::create(), 
"linear", 0); addPattern(LLMQuantRecipeEmbeddingPattern::create(), "embedding", 0); addPattern(LLMQuantRecipeViewPattern::create(), "view", 0); - if (config["quant_recipe"]["builtin_llm_pass"]["model"] == "qwen3") { - addPattern(LLMQuantRecipeQwen3AttentionPattern::create(), "qwen3_attention", 100); - } } uint8_t LLMQuantRecipePass::run(const ir::node_ptr_t& op) { diff --git a/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.hpp b/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.hpp index dbb6d1dc1..abd7cdbcc 100644 --- a/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.hpp +++ b/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.hpp @@ -32,6 +32,32 @@ bool noSharingSingleInAndSingleOutQuantAnnoAttr(const ir::IRContext::ptr_t& ctx, ir::linalg::LinalgIRQuantizatonSpecAttr::ptr_t cloneQuantizationSpecType( const ir::IRContext::ptr_t& ctx, const ir::linalg::LinalgIRQuantizatonSpecAttr::ptr_t& from); +//===----------------------------------------------------------------------===// +// Sigmoid Pattern +//===----------------------------------------------------------------------===// +class LLMQuantRecipeSigmoidPattern : public ir::Pattern { + public: + bool isMatch(const mllm::ir::op_ptr_t& op) override; + + bool rewrite(ir::IRWriter& writer, const ir::op_ptr_t& node) override; + + static inline std::shared_ptr create() { + return std::make_shared(); + } +}; + +//===----------------------------------------------------------------------===// +// Negative Pattern +//===----------------------------------------------------------------------===// +class LLMQuantRecipeNegPattern : public ir::Pattern { + public: + bool isMatch(const mllm::ir::op_ptr_t& op) override; + + bool rewrite(ir::IRWriter& writer, const ir::op_ptr_t& node) override; + + static inline std::shared_ptr create() { return std::make_shared(); } +}; + //===----------------------------------------------------------------------===// // ReduceMin Pattern 
//===----------------------------------------------------------------------===// @@ -110,6 +136,18 @@ class LLMQuantRecipeIndexPattern : public ir::Pattern { static inline std::shared_ptr create() { return std::make_shared(); } }; +//===----------------------------------------------------------------------===// +// Slice Pattern +//===----------------------------------------------------------------------===// +class LLMQuantRecipeSlicePattern : public ir::Pattern { + public: + bool isMatch(const mllm::ir::op_ptr_t& op) override; + + bool rewrite(ir::IRWriter& writer, const ir::op_ptr_t& node) override; + + static inline std::shared_ptr create() { return std::make_shared(); } +}; + //===----------------------------------------------------------------------===// // Elementwise Pattern //===----------------------------------------------------------------------===// diff --git a/mllm/backends/qnn/aot/passes/PTQPass.cpp b/mllm/backends/qnn/aot/passes/PTQPass.cpp index e69de29bb..9d4cabee3 100644 --- a/mllm/backends/qnn/aot/passes/PTQPass.cpp +++ b/mllm/backends/qnn/aot/passes/PTQPass.cpp @@ -0,0 +1,29 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#include "mllm/backends/qnn/aot/passes/PTQPass.hpp" +#include "mllm/backends/qnn/aot/passes/AOTCompileContext.hpp" +#include "mllm/compile/ir/builtin/Op.hpp" +#include "mllm/compile/ir/graph/Op.hpp" +#include "mllm/compile/ir/linalg/Op.hpp" +#include "mllm/compile/ir/tensor/Value.hpp" +#include "mllm/compile/ir/cf/Op.hpp" +#include "mllm/compile/ir/Node.hpp" +#include "mllm/core/OpTypes.hpp" +#include "mllm/utils/Common.hpp" + +namespace mllm::qnn::aot { + +namespace { + +void solveStaticWeights() {} + +void solveStaticRoPE() {} + +} // namespace + +uint8_t PTQPass::run(const ir::node_ptr_t& op) { return ir::PASS_RET_SUCCESS; } + +ir::Pass::ptr_t createPTQPass() { return std::make_shared(); } + +} // namespace mllm::qnn::aot diff --git a/mllm/backends/qnn/aot/passes/PTQPass.hpp b/mllm/backends/qnn/aot/passes/PTQPass.hpp index e69de29bb..6d6d35305 100644 --- a/mllm/backends/qnn/aot/passes/PTQPass.hpp +++ b/mllm/backends/qnn/aot/passes/PTQPass.hpp @@ -0,0 +1,32 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#pragma once + +#include "mllm/compile/passes/Pass.hpp" +#include "mllm/compile/ir/Node.hpp" + +namespace mllm::qnn::aot { + +//===----------------------------------------------------------------------===// +// PTQPass - Post-Training Quantization Pass +// This pass applies post-training quantization transformations to the IR. +// It walks through the computation graph and applies quantization +// based on configuration parameters. 
+//===----------------------------------------------------------------------===// +class PTQPass final : public ir::Pass { + public: + PTQPass() = default; + + ~PTQPass() override = default; + + // Run the PTQ pass on the given operation + // Expected input: ModuleOp containing the computation graph + // Output: Modified IR with PTQ transformations applied + uint8_t run(const ir::node_ptr_t& op) override; +}; + +// Factory function to create PTQPass instance +ir::Pass::ptr_t createPTQPass(); + +} // namespace mllm::qnn::aot diff --git a/mllm/compile/ir/GeneratedRTTIKind.hpp b/mllm/compile/ir/GeneratedRTTIKind.hpp index 0f48660a6..9c48d0535 100644 --- a/mllm/compile/ir/GeneratedRTTIKind.hpp +++ b/mllm/compile/ir/GeneratedRTTIKind.hpp @@ -1,4 +1,4 @@ -// Auto generated: 2025-12-29 05:14:54 +// Auto generated: 2026-01-04 13:13:09 // do not modify this file #pragma once @@ -85,6 +85,7 @@ enum NodeKind : uint32_t { RK_Op_LinalgIROp_RadixAttnSwaSinkOp, RK_Op_LinalgIROp_EqualOp, RK_Op_LinalgIROp_WhereOp, + RK_Op_LinalgIROp_SigmoidOp, RK_Op_LinalgIROp_CustomizedOp, RK_Op_LinalgIROp_Last, RK_Op_GraphIROp, diff --git a/mllm/compile/ir/NodeRTTIClassOfImpl.hpp b/mllm/compile/ir/NodeRTTIClassOfImpl.hpp index c7de7f72e..1e631c8de 100644 --- a/mllm/compile/ir/NodeRTTIClassOfImpl.hpp +++ b/mllm/compile/ir/NodeRTTIClassOfImpl.hpp @@ -1,4 +1,4 @@ -// Auto generated: 2025-12-29 05:14:54 +// Auto generated: 2026-01-04 13:13:09 // do not modify this file #pragma once namespace mllm::ir { @@ -226,6 +226,9 @@ struct NodeRTTIClassOfImpl { #define RTTI_RK_OP_LINALGIROP_WHEREOP_IMPL(v) \ return (v)->getKind() >= RK_Op_LinalgIROp_WhereOp && (v)->getKind() <= RK_Op_LinalgIROp_WhereOp +#define RTTI_RK_OP_LINALGIROP_SIGMOIDOP_IMPL(v) \ + return (v)->getKind() >= RK_Op_LinalgIROp_SigmoidOp && (v)->getKind() <= RK_Op_LinalgIROp_SigmoidOp + #define RTTI_RK_OP_LINALGIROP_CUSTOMIZEDOP_IMPL(v) \ return (v)->getKind() >= RK_Op_LinalgIROp_CustomizedOp && (v)->getKind() <= RK_Op_LinalgIROp_CustomizedOp 
diff --git a/mllm/compile/ir/linalg/Attribute.hpp b/mllm/compile/ir/linalg/Attribute.hpp index 34a116c5c..576530362 100644 --- a/mllm/compile/ir/linalg/Attribute.hpp +++ b/mllm/compile/ir/linalg/Attribute.hpp @@ -60,6 +60,7 @@ struct QuantizationSpec { using ptr_t = std::shared_ptr; QuantizationSpecType type; uint64_t uuid; + bool solved = false; }; struct QuantizationSpecRaw : public QuantizationSpec { diff --git a/mllm/compile/ir/linalg/Op.cpp b/mllm/compile/ir/linalg/Op.cpp index 320dcf5d3..c303e9d3e 100644 --- a/mllm/compile/ir/linalg/Op.cpp +++ b/mllm/compile/ir/linalg/Op.cpp @@ -116,6 +116,7 @@ LINALG_AOPS_DECL(OpTypes::kArgsort, ArgsortOp); LINALG_AOPS_DECL(OpTypes::kEqual, EqualOp); LINALG_AOPS_DECL(OpTypes::kWhere, WhereOp); +LINALG_AOPS_DECL(OpTypes::kSigmoid, SigmoidOp); // Customized Ops LINALG_AOPS_DECL(OpTypes::kFlashAttention2WithSinkAndSwa, FlashAttention2SwaSinkOp); diff --git a/mllm/compile/ir/linalg/Op.hpp b/mllm/compile/ir/linalg/Op.hpp index 7e93a288f..a737a623a 100644 --- a/mllm/compile/ir/linalg/Op.hpp +++ b/mllm/compile/ir/linalg/Op.hpp @@ -79,6 +79,7 @@ class RadixAttnRelaxOp; class RadixAttnSwaSinkOp; class EqualOp; class WhereOp; +class SigmoidOp; } // namespace mllm #define LINALG_AOPS_DEFINE(class_name, rtti_name) \ @@ -251,6 +252,7 @@ LINALG_AOPS_DEFINE(WhereOp, WHEREOP); LINALG_AOPS_DEFINE(FlashAttention2SwaSinkOp, FLASHATTENTION2SWASINKOP); LINALG_AOPS_DEFINE(RadixAttnRelaxOp, RADIXATTNRELAXOP); LINALG_AOPS_DEFINE(RadixAttnSwaSinkOp, RADIXATTNSWASINKOP); +LINALG_AOPS_DEFINE(SigmoidOp, SIGMOIDOP); /** * @brief CustomizedOp: A generic operation type for implementing backend-specific operations diff --git a/mllm/compile/ir/rtti_kind_gen.py b/mllm/compile/ir/rtti_kind_gen.py index cb2ad4d52..7615e323b 100644 --- a/mllm/compile/ir/rtti_kind_gen.py +++ b/mllm/compile/ir/rtti_kind_gen.py @@ -290,6 +290,7 @@ def define_lianlg_ir(ir: dict): op.derive(Cls("RadixAttnSwaSinkOp")) op.derive(Cls("EqualOp")) op.derive(Cls("WhereOp")) + 
op.derive(Cls("SigmoidOp")) # customized ops op.derive(Cls("CustomizedOp")) diff --git a/mllm/core/DataTypes.cpp b/mllm/core/DataTypes.cpp index 0cc549b71..b349eb292 100644 --- a/mllm/core/DataTypes.cpp +++ b/mllm/core/DataTypes.cpp @@ -48,8 +48,11 @@ size_t lanesOfType(DataTypes dtype) { CASE(kComplexFloat64) CASE(kInt16PerTensorSym) CASE(kInt8PerTensorSym) + CASE(kUInt8PerTensorSym) + CASE(kUInt16PerTensorAsy) + CASE(kUInt16PerTensorSym) case kByte: return MllmDataTypeInfo::lanes(); - default: NYI("Unknown data type"); + default: NYI("Unknown data type {}", (int32_t)dtype); } return 1; #undef CASE @@ -96,7 +99,10 @@ size_t bytesOfType(DataTypes dtype) { CASE(kComplexFloat32) CASE(kComplexFloat64) CASE(kInt16PerTensorSym) + CASE(kUInt16PerTensorSym) + CASE(kUInt16PerTensorAsy) CASE(kInt8PerTensorSym) + CASE(kUInt8PerTensorSym) CASE(kInt4) CASE(kUInt4) case kByte: return MllmDataTypeInfo::bytes(); @@ -147,7 +153,10 @@ std::string nameOfType(DataTypes dtype) { CASE(kComplexFloat32) CASE(kComplexFloat64) CASE(kInt16PerTensorSym) + CASE(kUInt16PerTensorSym) + CASE(kUInt16PerTensorAsy) CASE(kInt8PerTensorSym) + CASE(kUInt8PerTensorSym) CASE(kInt4) CASE(kUInt4) case kByte: return MllmDataTypeInfo::name(); diff --git a/mllm/core/OpTypes.hpp b/mllm/core/OpTypes.hpp index 2b9916a90..849df8941 100644 --- a/mllm/core/OpTypes.hpp +++ b/mllm/core/OpTypes.hpp @@ -95,6 +95,8 @@ enum class OpTypes : int32_t { kEqual = 73, kWhere = 74, + kSigmoid = 75, + // Dynamic Op Start for user to register there own ops. 
kDynamicOp_Start = 4096, diff --git a/mllm/core/Tensor.cpp b/mllm/core/Tensor.cpp index e151341d9..ee0d69752 100644 --- a/mllm/core/Tensor.cpp +++ b/mllm/core/Tensor.cpp @@ -32,12 +32,12 @@ namespace mllm { void Tensor::operator delete(void* ptr) noexcept { ((Tensor*)ptr)->impl_.reset(); - for (auto& [a, _] : ((Tensor*)ptr)->attached_views_) { ((Tensor*)ptr)->attached_views_[a].reset(); } + for (auto& [a, _] : ((Tensor*)ptr)->impl_->attachedViews()) { ((Tensor*)ptr)->impl_->attachedViews()[a].reset(); } } void Tensor::delete_() noexcept { this->impl_.reset(); - for (auto& [a, _] : this->attached_views_) { this->attached_views_[a].reset(); } + for (auto& [a, _] : this->impl_->attachedViews()) { this->impl_->attachedViews()[a].reset(); } } /** @@ -75,6 +75,21 @@ Tensor Tensor::empty(const std::vector& shape, DataTypes dtype, DeviceT return Tensor(impl); } +Tensor Tensor::constant(float x, DataTypes dtype, DeviceTypes device) { + auto rhs_tensor = Tensor::empty({1}, dtype, device).alloc(); + switch (dtype) { + case kFloat32: *(rhs_tensor.ptr()) = x; break; + case kFloat16: *(rhs_tensor.ptr()) = half_float::half(x); break; + case kInt32: *(rhs_tensor.ptr()) = x; break; + case kInt16: *(rhs_tensor.ptr()) = x; break; + case kInt8: *(rhs_tensor.ptr()) = x; break; + case kInt16PerTensorSym: *(rhs_tensor.ptr()) = x; break; + case kUInt16PerTensorAsy: *(rhs_tensor.ptr()) = x; break; + default: NYI("Type is not supported"); break; + } + return rhs_tensor; +} + Tensor Tensor::emptyLike(const Tensor& liked_tensor) { auto ret = Tensor::empty(liked_tensor.shape(), liked_tensor.dtype(), liked_tensor.device()); return ret; @@ -82,16 +97,16 @@ Tensor Tensor::emptyLike(const Tensor& liked_tensor) { Tensor& Tensor::allocExtraTensorView(const std::string& extra_tensor_name, const std::vector& shape, DataTypes dtype, DeviceTypes device) { - MLLM_RT_ASSERT_EQ(attached_views_.count(extra_tensor_name), 0); + MLLM_RT_ASSERT_EQ(impl_->attachedViews().count(extra_tensor_name), 0); auto 
storage = TensorStorage::create(shape, dtype, device); auto impl = TensorViewImpl::create(shape, storage); - attached_views_.insert({extra_tensor_name, impl}); + impl_->attachedViews().insert({extra_tensor_name, impl}); return *this; } Tensor Tensor::getExtraTensorViewInTensor(const std::string& extra_tensor_name) { - MLLM_RT_ASSERT_EQ(attached_views_.count(extra_tensor_name), 1); - return Tensor(attached_views_.at(extra_tensor_name)); + MLLM_RT_ASSERT_EQ(impl_->attachedViews().count(extra_tensor_name), 1); + return Tensor(impl_->attachedViews().at(extra_tensor_name)); } Tensor Tensor::zeros(const std::vector& shape, DataTypes dtype, DeviceTypes device) { @@ -275,6 +290,27 @@ Tensor Tensor::mul(float rhs, DataTypes data_type) { return Context::instance().buildOpAndSubmitTask(OpTypes::kMul, opts, {*this, rhs_tensor})[0]; } +Tensor Tensor::addConstant(Tensor rhs) { + auto opts = aops::AddOpOptions{}; + opts.setInputsConstant(0, 0); + opts.setInputsConstant(1, 1); + return Context::instance().buildOpAndSubmitTask(OpTypes::kAdd, opts, {*this, rhs})[0]; // NOLINT +} + +Tensor Tensor::subConstant(Tensor rhs) { + auto opts = aops::SubOpOptions{}; + opts.setInputsConstant(0, 0); + opts.setInputsConstant(1, 1); + return Context::instance().buildOpAndSubmitTask(OpTypes::kSub, opts, {*this, rhs})[0]; // NOLINT +} + +Tensor Tensor::mulConstant(Tensor rhs) { + auto opts = aops::MulOpOptions{}; + opts.setInputsConstant(0, 0); + opts.setInputsConstant(1, 1); + return Context::instance().buildOpAndSubmitTask(OpTypes::kMul, opts, {*this, rhs})[0]; // NOLINT +} + Tensor Tensor::operator/(float rhs) { auto rhs_tensor = Tensor::empty({1}, dtype(), device()).alloc(); if (device() != kCPU) { @@ -485,14 +521,14 @@ size_t Tensor::hash() const { std::vector heap_buf; auto* buf = stack_buf; - size_t count = 1 + attached_views_.size(); + size_t count = 1 + impl_->attachedViews().size(); if (count > kStackCap) { heap_buf.resize(count); buf = heap_buf.data(); } buf[0] = uuid(); size_t idx = 1; 
- for (const auto& [_, view] : attached_views_) { buf[idx++] = view ? view->uuid() : 0u; } + for (const auto& [_, view] : impl_->attachedViews()) { buf[idx++] = view ? view->uuid() : 0u; } return XXH64(buf, count * sizeof(uint32_t), 0); } diff --git a/mllm/core/Tensor.hpp b/mllm/core/Tensor.hpp index 334441501..96a375622 100644 --- a/mllm/core/Tensor.hpp +++ b/mllm/core/Tensor.hpp @@ -175,6 +175,8 @@ class Tensor { */ static Tensor empty(const std::vector& shape, DataTypes dtype = kFloat32, DeviceTypes device = kCPU); + static Tensor constant(float x, DataTypes dtype = kFloat32, DeviceTypes device = kCPU); + /** * @brief Creates an uninitialized tensor with the same shape and attributes as another tensor. * @@ -290,6 +292,10 @@ class Tensor { Tensor sub(float rhs, DataTypes data_type = kFloat32); Tensor mul(float rhs, DataTypes data_type = kFloat32); + Tensor addConstant(Tensor rhs); + Tensor subConstant(Tensor rhs); + Tensor mulConstant(Tensor rhs); + /// @name Scalar Operations with complex rhs type /// Element-wise operations with complex rhs type scalar values. 
/// @{ @@ -692,16 +698,15 @@ class Tensor { return *(const_cast(this)->offsettedPtr(offsets)); } - [[nodiscard]] std::unordered_map& attachedViews() { return attached_views_; } + [[nodiscard]] std::unordered_map& attachedViews() { return impl_->attachedViews(); } - void attach(const std::string& name, const TensorViewImpl::ptr_t& view) { attached_views_[name] = view; } + void attach(const std::string& name, const TensorViewImpl::ptr_t& view) { impl_->attachedViews()[name] = view; } private: template friend __LinkedTensor operator<<(const Tensor& t, T first); std::shared_ptr impl_ = nullptr; - std::unordered_map attached_views_; }; template diff --git a/mllm/core/TensorViewImpl.hpp b/mllm/core/TensorViewImpl.hpp index 536148203..4b7b146b7 100644 --- a/mllm/core/TensorViewImpl.hpp +++ b/mllm/core/TensorViewImpl.hpp @@ -89,12 +89,15 @@ class TensorViewImpl : public std::enable_shared_from_this { inline void dropStorage() { storage_ = nullptr; } + inline std::unordered_map& attachedViews() { return attached_views_; } + private: int32_t shape_len_ = 0; int32_t storage_offset_ = 0; int32_t shape_[MLLM_TENSOR_SHAPE_MAX_LEN]; int32_t stride_[MLLM_TENSOR_SHAPE_MAX_LEN]; std::shared_ptr storage_ = nullptr; + std::unordered_map attached_views_; }; } // namespace mllm diff --git a/mllm/core/aops/ElewiseOps.cpp b/mllm/core/aops/ElewiseOps.cpp index 85c3027f8..1bf8c60dd 100644 --- a/mllm/core/aops/ElewiseOps.cpp +++ b/mllm/core/aops/ElewiseOps.cpp @@ -112,7 +112,6 @@ __MLLM_ELEWISE_OP_IMPL(kAdd, AddOp); __MLLM_ELEWISE_OP_IMPL(kSub, SubOp); __MLLM_ELEWISE_OP_IMPL(kMul, MulOp); __MLLM_ELEWISE_OP_IMPL(kDiv, DivOp); -__MLLM_ELEWISE_OP_IMPL(kNeg, NegOp); // ---------- Unary Ops __MLLM_ELEWISE_UNARY_OP_IMPL(kAbs, AbsOp); @@ -121,6 +120,7 @@ __MLLM_ELEWISE_UNARY_OP_IMPL(kClip, ClipOp); __MLLM_ELEWISE_UNARY_OP_IMPL(kExp, ExpOp); __MLLM_ELEWISE_UNARY_OP_IMPL(kSin, SinOp); __MLLM_ELEWISE_UNARY_OP_IMPL(kCos, CosOp); +__MLLM_ELEWISE_UNARY_OP_IMPL(kNeg, NegOp); } // namespace mllm::aops diff 
--git a/mllm/core/aops/ParamOp.cpp b/mllm/core/aops/ParamOp.cpp index a8e10c3d9..ffa161570 100644 --- a/mllm/core/aops/ParamOp.cpp +++ b/mllm/core/aops/ParamOp.cpp @@ -2,6 +2,8 @@ // Licensed under the MIT License. #include "mllm/core/aops/ParamOp.hpp" +#include "mllm/compile/ir/graph/Op.hpp" +#include "mllm/compile/ir/tensor/Op.hpp" #include "mllm/core/BaseOp.hpp" #include "mllm/core/Tensor.hpp" #include "mllm/utils/Common.hpp" @@ -31,14 +33,16 @@ void ParamOp::load(const ParameterFile::ptr_t& ploader) { void ParamOp::trace(void* trace_context, const std::vector& inputs, std::vector& outputs) { auto ir_ctx = (ir::IRContext*)trace_context; - auto i_irs = ir::tensor::wrapTensors2TensorIR(ir_ctx, inputs); - auto o_irs = ir::tensor::wrapTensors2TensorIR(ir_ctx, outputs); - ir_ctx->create(shared_from_this(), i_irs, o_irs); + // Register Params + if (weight_ && !ir_ctx->lookupSymbolTable(getName())) { + ir::IRWriterGuard guard(ir_ctx, ir_ctx->lookupSymbolTable("init")->cast_()->getTopRegion()); + ir_ctx->create(ir_ctx->create(weight_)); + } } void ParamOp::forward(const std::vector& inputs, std::vector& outputs) { MLLM_EMPTY_SCOPE; } -void ParamOp::reshape(const std::vector& inputs, std::vector& outputs) { MLLM_EMPTY_SCOPE; } +void ParamOp::reshape(const std::vector& inputs, std::vector& outputs) { outputs.emplace_back(weight_); } void ParamOp::setup(const std::vector& inputs, std::vector& outputs) { MLLM_EMPTY_SCOPE; } diff --git a/mllm/core/aops/SigmoidOp.cpp b/mllm/core/aops/SigmoidOp.cpp new file mode 100644 index 000000000..a57d89255 --- /dev/null +++ b/mllm/core/aops/SigmoidOp.cpp @@ -0,0 +1,37 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#include "mllm/core/aops/SigmoidOp.hpp" +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/utils/Common.hpp" +#include "mllm/compile/ir/linalg/Op.hpp" + +namespace mllm::aops { + +SigmoidOp::SigmoidOp(const SigmoidOpOptions& options) : BaseOp(OpTypes::kSigmoid), options_(options) {} + +void SigmoidOp::load(const ParameterFile::ptr_t& ploader) { MLLM_EMPTY_SCOPE; } + +void SigmoidOp::trace(void* trace_context, const std::vector& inputs, std::vector& outputs) { + auto ir_ctx = (ir::IRContext*)trace_context; + auto i_irs = ir::tensor::wrapTensors2TensorIR(ir_ctx, inputs); + auto o_irs = ir::tensor::wrapTensors2TensorIR(ir_ctx, outputs); + ir_ctx->create(shared_from_this(), i_irs, o_irs); +} + +void SigmoidOp::forward(const std::vector& inputs, std::vector& outputs) { + NYI("SigmoidOp::forward not implemented in aops base."); +} + +void SigmoidOp::reshape(const std::vector& inputs, std::vector& outputs) { + if (options_.isInplace()) { + outputs.emplace_back(inputs[0]); + } else { + outputs.emplace_back(Tensor::empty(inputs[0].shape(), inputs[0].dtype(), inputs[0].device())); + } +} + +void SigmoidOp::setup(const std::vector& inputs, std::vector& outputs) { BaseOp::setup(inputs, outputs); } + +} // namespace mllm::aops diff --git a/mllm/core/aops/SigmoidOp.hpp b/mllm/core/aops/SigmoidOp.hpp new file mode 100644 index 000000000..29c5651d5 --- /dev/null +++ b/mllm/core/aops/SigmoidOp.hpp @@ -0,0 +1,33 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#pragma once + +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/ParameterFile.hpp" + +namespace mllm::aops { + +struct SigmoidOpOptions : public BaseOpOptions {}; + +class SigmoidOp : public BaseOp { + public: + explicit SigmoidOp(const SigmoidOpOptions& options); + + void load(const ParameterFile::ptr_t& ploader) override; + + void trace(void* trace_context, const std::vector& inputs, std::vector& outputs) override; + + void forward(const std::vector& inputs, std::vector& outputs) override; + + void reshape(const std::vector& inputs, std::vector& outputs) override; + + void setup(const std::vector& inputs, std::vector& outputs) override; + + inline SigmoidOpOptions& options() { return options_; } + + protected: + SigmoidOpOptions options_; +}; + +} // namespace mllm::aops diff --git a/mllm/nn/Functional.cpp b/mllm/nn/Functional.cpp index c863ee169..ab0c12f80 100644 --- a/mllm/nn/Functional.cpp +++ b/mllm/nn/Functional.cpp @@ -8,6 +8,7 @@ #include "mllm/core/aops/MatMulOp.hpp" #include "mllm/core/aops/ReduceOps.hpp" #include "mllm/core/aops/Scatter2ShardsOp.hpp" +#include "mllm/core/aops/SigmoidOp.hpp" #include "mllm/core/aops/SoftmaxOp.hpp" #include "mllm/core/aops/ElewiseOps.hpp" #include "mllm/core/aops/SplitOp.hpp" @@ -205,4 +206,9 @@ mllm::Tensor where(const Tensor& mask, const Tensor& original, const Tensor& v) return ctx.buildOpAndSubmitTask(OpTypes::kWhere, aops::WhereOpOptions{}, {mask, original, v})[0]; } +mllm::Tensor sigmoid(const Tensor& x) { + auto& ctx = mllm::Context::instance(); + return ctx.buildOpAndSubmitTask(OpTypes::kSigmoid, aops::SigmoidOpOptions{}, {x})[0]; +} + } // namespace mllm::nn::functional diff --git a/mllm/nn/Functional.hpp b/mllm/nn/Functional.hpp index 1a2f6d3df..bd0cca9dd 100644 --- a/mllm/nn/Functional.hpp +++ b/mllm/nn/Functional.hpp @@ -160,4 +160,6 @@ mllm::Tensor radixAttnRelax(const mllm::Tensor& Q, const mllm::Tensor& K_idx, co mllm::Tensor where(const Tensor& mask, const Tensor& original, const Tensor& v); 
+mllm::Tensor sigmoid(const Tensor& x); + } // namespace mllm::nn::functional diff --git a/mllm/nn/Module.cpp b/mllm/nn/Module.cpp index d8328bb9e..55b079ac7 100644 --- a/mllm/nn/Module.cpp +++ b/mllm/nn/Module.cpp @@ -22,7 +22,7 @@ void ModuleImpl::load(const ParameterFile::ptr_t& param_file) { case AbstractNnNodeTypes::kLayer: std::static_pointer_cast(hb)->load(param_file); break; } } - resources_mapped_files_.push_back(param_file->getMappedFile()); + resources_mapped_files_.push_back(param_file); } ParameterFile::ptr_t ModuleImpl::params(ModelFileVersion v) { @@ -75,6 +75,11 @@ Tensor ModuleImpl::getBuffer(const std::string& name) { return buffer_[name]; } void ModuleImpl::updateBuffer(const std::string& name, const Tensor& tensor) { buffer_[name] = tensor; } +ParameterFile::ptr_t ModuleImpl::getTopParameterFile() { + if (resources_mapped_files_.empty()) { return nullptr; } + return resources_mapped_files_.back(); +} + Module::Module() { impl_ = std::make_shared(); impl()->setName(""); @@ -93,6 +98,8 @@ ModuleImpl::ptr_t Module::impl() const { return impl_; } void Module::to(DeviceTypes device_type) { impl()->to(device_type); } +ParameterFile::ptr_t Module::getTopParameterFile() { return impl_->getTopParameterFile(); } + void Module::load(const ParameterFile::ptr_t& param_file) { impl_->load(param_file); } std::vector Module::forward(const std::vector& inputs, const std::vector& args) { return {}; } diff --git a/mllm/nn/Module.hpp b/mllm/nn/Module.hpp index 79e79dbb0..4965aa6a5 100644 --- a/mllm/nn/Module.hpp +++ b/mllm/nn/Module.hpp @@ -37,10 +37,12 @@ class ModuleImpl : public AbstractNnNode { void updateBuffer(const std::string& name, const Tensor& tensor); + ParameterFile::ptr_t getTopParameterFile(); + private: /// Buffer is tensors that will not shown in params. And will not be saved. 
SymbolTable buffer_; - std::vector resources_mapped_files_; + std::vector resources_mapped_files_; }; template @@ -66,6 +68,8 @@ class Module { [[nodiscard]] DeviceTypes device() const { return impl_->getDevice(); } + ParameterFile::ptr_t getTopParameterFile(); + /** * @brief Register a module/layer into this module * diff --git a/pymllm/backends/qualcomm/transformers/core/qdq.py b/pymllm/backends/qualcomm/transformers/core/qdq.py index c7bc351de..ce67729f4 100644 --- a/pymllm/backends/qualcomm/transformers/core/qdq.py +++ b/pymllm/backends/qualcomm/transformers/core/qdq.py @@ -5,46 +5,70 @@ class ActivationQDQ(nn.Module): """ - General activation value pseudo-quantization module (QDQ). - Supports symmetric Per-Tensor quantization, configurable bit numbers (e.g., 8-bit or 16-bit). + General activation Quantization-DeQuantization (QDQ) module. + Supports both Symmetric and Asymmetric (Affine) quantization. + Uses torch.qint32 as a unified type to support various bit-widths. """ - def __init__(self, bits=8, qscheme=torch.per_tensor_symmetric): + def __init__(self, bits=8, qscheme=torch.per_tensor_affine): super().__init__() + self.bits = bits + self.qscheme = qscheme - # 1. Calculate quantization range based on bits - # int8: -128 to 127 - # int16: -32768 to 32767 - self.quant_min = -(2 ** (bits - 1)) - self.quant_max = 2 ** (bits - 1) - 1 + # Define the simulation dtype as qint32 to avoid overflow across different bit-widths + self.dtype = torch.qint32 + + # 1. Calculate quantization range based on bits and scheme + if qscheme in [torch.per_tensor_symmetric, torch.per_channel_symmetric]: + # Symmetric: range is [-(2^(bits-1)), 2^(bits-1) - 1] + # e.g., 8-bit: -128 to 127 + self.quant_min = -(2 ** (bits - 1)) + self.quant_max = 2 ** (bits - 1) - 1 + else: + # Asymmetric (Affine): range is [0, 2^bits - 1] + # e.g., 8-bit: 0 to 255 + self.quant_min = 0 + self.quant_max = (2**bits) - 1 # 2. 
Initialize FakeQuantize - # For activations, typically use MinMaxObserver or MovingAverageMinMaxObserver + # MinMaxObserver calculates scale and zero_point based on observed tensors. + # Passing quant_min/max to the observer ensures consistency. self.fake_quant = FakeQuantize( - observer=MinMaxObserver.with_args(qscheme=qscheme, dtype=torch.qint32), + observer=MinMaxObserver.with_args( + qscheme=self.qscheme, + dtype=self.dtype, + quant_min=self.quant_min, + quant_max=self.quant_max, + reduce_range=False, + ), quant_min=self.quant_min, quant_max=self.quant_max, - dtype=torch.qint32, - qscheme=qscheme, + dtype=self.dtype, + qscheme=self.qscheme, ) def forward(self, x): - # Directly apply pseudo-quantization. - # When observer is enabled, it continuously updates scale/zp; - # When fakequant is enabled, it simulates quantization errors. + # Applies fake quantization: rounds to nearest integer and clamps to [min, max], + # then dequantizes back to float to simulate quantization noise. return self.fake_quant(x) + # Control methods for quantization-aware training (QAT) def enable_observer(self): + """Enable tracking of min/max values to update scale and zero_point.""" self.fake_quant.enable_observer() def disable_observer(self): + """Freeze scale and zero_point calculation.""" self.fake_quant.disable_observer() def enable_fakequant(self): + """Enable simulation of quantization error.""" self.fake_quant.enable_fakequant() def disable_fakequant(self): + """Disable quantization simulation (act as identity).""" self.fake_quant.disable_fakequant() def extra_repr(self): - return f"bits={self.quant_max.bit_length() + 1}, q_range=({self.quant_min}, {self.quant_max})" + mode = "Symmetric" if "symmetric" in str(self.qscheme) else "Asymmetric" + return f"bits={self.bits}, mode={mode}, q_range=({self.quant_min}, {self.quant_max}), dtype={self.dtype}" diff --git a/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py 
b/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py index f06019f2a..5148684af 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py +++ b/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py @@ -206,11 +206,19 @@ def __init__(self, config: Qwen3Config, layer_idx: int): self.k_rope_mul_0_output_qdq = ActivationQDQ(bits=16) self.k_rope_mul_1_output_qdq = ActivationQDQ(bits=16) self.k_rope_add_0_output_qdq = ActivationQDQ(bits=16) - self.k_cast_to_int8_qdq = ActivationQDQ(bits=8) - self.v_cast_to_int8_qdq = ActivationQDQ(bits=8) + + # In qnn, is uint8 sym. + self.k_cast_to_int8_qdq = ActivationQDQ( + bits=8, qscheme=torch.per_tensor_symmetric + ) + self.v_cast_to_int8_qdq = ActivationQDQ( + bits=8, qscheme=torch.per_tensor_symmetric + ) + self.v_cast_to_int16_qdq = ActivationQDQ(bits=16) self.qk_matmul_output_qdq = ActivationQDQ(bits=16) self.scaling_qdq = ActivationQDQ(bits=16) + self.neg_20_qdq = ActivationQDQ(bits=16) self.reduce_min_output_qdq = ActivationQDQ(bits=16) self.mul_0_output_qdq = ActivationQDQ(bits=16) self.minus_0_output_qdq = ActivationQDQ(bits=16) @@ -281,7 +289,12 @@ def forward( attn_min = self.reduce_min_output_qdq( torch.amin(attn_weights, dim=-1, keepdim=True) ) - attn_vv = self.minus_0_output_qdq(attn_min - 20) + attn_vv = self.minus_0_output_qdq( + attn_min + + self.neg_20_qdq( + torch.ones(1, dtype=torch.bfloat16, device=value_states.device) * (-20) + ) + ) attn_weights = torch.where(attention_mask == 0, attn_weights, attn_vv) attn_weights = self.softmax_output_qdq( @@ -589,8 +602,8 @@ def __init__(self, config): super().__init__(config) self.model = Qwen3Model(config) self.vocab_size = config.vocab_size - self.lm_head = QLinearW8A16_PerChannelSym( - config.hidden_size, config.vocab_size, bias=False + self.lm_head = QLinearLPBQ( + config.hidden_size, config.vocab_size, bias=False, block_size=32 ) self.mllm_qualcomm_max_length = None diff --git a/pymllm/backends/qualcomm/transformers/qwen3/train.py 
b/pymllm/backends/qualcomm/transformers/qwen3/train.py index 746970020..8432e4812 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/train.py +++ b/pymllm/backends/qualcomm/transformers/qwen3/train.py @@ -1,4 +1,5 @@ import os +import torch import argparse from safetensors.torch import save_model from pymllm.backends.qualcomm.transformers.qwen3.runner import Qwen3Quantizer @@ -39,6 +40,9 @@ def main(): m.calibrate(num_samples=args.num_samples, max_seq_length=args.max_length) # m.compile() m.infer(args.infer_text) + m.model.lm_head.weight = torch.nn.Parameter( + m.model.model.embed_tokens.weight.clone() + ) os.makedirs(args.output_dir, exist_ok=True) model_save_path = os.path.join(args.output_dir, "model.safetensors") diff --git a/pymllm/quantize/pipeline.py b/pymllm/quantize/pipeline.py index 71da013c6..288187fc5 100644 --- a/pymllm/quantize/pipeline.py +++ b/pymllm/quantize/pipeline.py @@ -20,9 +20,15 @@ def build_cast2fp32_pipeline() -> QuantizeSolver: return ret +def build_raw_pipeline() -> QuantizeSolver: + ret = QuantizeSolver() + return ret + + BUILTIN_QUANTIZE_PIPELINE: Dict = { "w4a32_kai_pipeline": build_w4a32_kai_pipeline, "cast2fp32_pipeline": build_cast2fp32_pipeline, + "_raw": build_raw_pipeline, } BUILTIN_QUANTIZE_PASS: Dict = { "w4a32_kai": W4A32KAIQuantizePass, diff --git a/pymllm/utils/mllm_convertor.py b/pymllm/utils/mllm_convertor.py index 7b1aabfb0..d5e8a5c2f 100644 --- a/pymllm/utils/mllm_convertor.py +++ b/pymllm/utils/mllm_convertor.py @@ -66,6 +66,24 @@ def main(): cast_left_2_fp32=True, verbose=args.verbose, ) + elif args.cfg_path is None and args.pipeline is None and args.format == "v2": + cfg = None + pipeline: QuantizeSolver = BUILTIN_QUANTIZE_PIPELINE["_raw"]() + old_param_size = len(params) + new_param_size = pipeline.stream_quantize_params_size(cfg, params) + print(f"Params Num: Before: {old_param_size}, After: {new_param_size}") + pipeline.stream_quantize( + cfg, + params, + writer=ModelFileV2( + args.output_path, + 
args.model_name, + "Streaming", + max_params_descriptor_buffer_num=new_param_size, + ), + cast_left_2_fp32=False, + verbose=args.verbose, + ) elif ( args.cfg_path is not None and args.pipeline is not None and args.format == "v2" ): From 6d7b5b98e4185614d360ec113c7e4adf62689af3 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Mon, 5 Jan 2026 03:31:46 +0000 Subject: [PATCH 09/13] fix: tensor attaching view selective hashing --- .../qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp | 21 +- examples/qwen3_qnn_aot/qwen3_qnn_aot.mir | 3921 +++++++++-------- .../qwen3_qnn_aot_quant_recipe.mir | 3508 +++++++-------- mllm/backends/qnn/QNNUtils.hpp | 4 +- mllm/backends/qnn/aot/passes/AOTPipeline.cpp | 4 +- mllm/compile/ir/linalg/Attribute.cpp | 2 + mllm/core/Tensor.cpp | 17 +- mllm/core/Tensor.hpp | 8 +- mllm/core/TensorViewImpl.hpp | 6 +- 9 files changed, 3818 insertions(+), 3673 deletions(-) diff --git a/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp b/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp index 1f0da38e7..5677d27f2 100644 --- a/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp +++ b/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp @@ -41,8 +41,8 @@ Tensor QDQ(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch) { case kUInt16PerTensorAsy: { auto scale = m->getTopParameterFile()->pull(scale_name); auto zp = m->getTopParameterFile()->pull(zp_name); - in.attach("scale", scale.impl()); - in.attach("zero_point", zp.impl()); + in.attach("scale", scale.impl(), true); + in.attach("zero_point", zp.impl(), true); break; } // For Constant! 
@@ -51,8 +51,8 @@ Tensor QDQ(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch) { MLLM_RT_ASSERT_EQ(in.size(-1), 1); auto scale = m->getTopParameterFile()->pull(scale_name); auto zp = m->getTopParameterFile()->pull(zp_name); - in.attach("scale", scale.impl()); - in.attach("zero_point", zp.impl()); + in.attach("scale", scale.impl(), true); + in.attach("zero_point", zp.impl(), true); break; } default: { @@ -76,8 +76,8 @@ Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch) // Is 128! not 127! auto new_zp = Tensor::constant(128, kInt32).setName(zp_name).setMemType(kParamsNormal); - in.attach("scale", scale.impl()); - in.attach("zero_point", new_zp.impl()); + in.attach("scale", scale.impl(), true); + in.attach("zero_point", new_zp.impl(), true); break; } default: { @@ -372,7 +372,6 @@ class Qwen3Text final : public nn::Module { auto position_ids = inputs[1]; auto causal_mask = inputs[2]; - position_ids = position_ids.squeeze(0); auto llm_embedding_sin = rope_sin_()[{{0}, position_ids, {kAll}}]; auto llm_embedding_cos = rope_cos_()[{{0}, position_ids, {kAll}}]; @@ -459,13 +458,13 @@ class Qwen3ForCausalLM : public ARGeneration, public nn::Module { // For decode phase, increment the last position if (seq_len == 1) { - auto last_pos = *position_ids.offsettedPtr({0, position_ids.shape()[1] - 1}); - position_ids = Tensor::empty({batch_size, 1}, kInt32, kCPU).alloc(); - *position_ids.offsettedPtr({0, 0}) = last_pos + 1; + auto last_pos = *position_ids.offsettedPtr({position_ids.shape()[1] - 1}); + position_ids = Tensor::empty({1}, kInt32, kCPU).alloc(); + *position_ids.offsettedPtr({0}) = last_pos + 1; } } else { // Generate position_ids for prefill phase - position_ids = Tensor::empty({batch_size, seq_len}, kInt32, kCPU).alloc(); + position_ids = Tensor::empty({seq_len}, kInt32, kCPU).alloc(); auto position_ids_ptr = position_ids.ptr(); for (int s = 0; s < seq_len; ++s) { position_ids_ptr[s] = s; } } diff --git 
a/examples/qwen3_qnn_aot/qwen3_qnn_aot.mir b/examples/qwen3_qnn_aot/qwen3_qnn_aot.mir index 1caff3b4a..200c4982b 100644 --- a/examples/qwen3_qnn_aot/qwen3_qnn_aot.mir +++ b/examples/qwen3_qnn_aot/qwen3_qnn_aot.mir @@ -1,319 +1,319 @@ @main () -> () { graph.SubGraphOp @init [symbol:init] { () -> () { - tensor.CPU.register () -> (%7516:tensor<[151936, 2048], Float32, CPU>[@model.embed_tokens.weight][quant_recipe:QuantSpec(Raw(type: Float32), uuid=61), symbol:model.embed_tokens.weight])[symbol:model.embed_tokens.weight] - tensor.CPU.register () -> (%8011:tensor<[1, 1024, 128], Int16PerTensor, CPU>[@rope_sin][symbol:rope_sin])[symbol:rope_sin] - tensor.CPU.register () -> (%8012:tensor<[1, 1024, 128], Int16PerTensor, CPU>[@rope_cos][symbol:rope_cos])[symbol:rope_cos] - tensor.CPU.register () -> (%6662:tensor<[2048], Float32, CPU>[@model.layers.0.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=67), symbol:model.layers.0.input_layernorm.weight])[symbol:model.layers.0.input_layernorm.weight] - tensor.CPU.register () -> (%7778:tensor<[2048, 2048], Float32, CPU>[@model.layers.0.self_attn.q_proj.weight][symbol:model.layers.0.self_attn.q_proj.weight])[symbol:model.layers.0.self_attn.q_proj.weight] - tensor.CPU.register () -> (%61:tensor<[1024, 2048], Float32, CPU>[@model.layers.0.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=68), symbol:model.layers.0.self_attn.k_proj.weight])[symbol:model.layers.0.self_attn.k_proj.weight] - tensor.CPU.register () -> (%5178:tensor<[1024, 2048], Float32, CPU>[@model.layers.0.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=70), 
symbol:model.layers.0.self_attn.v_proj.weight])[symbol:model.layers.0.self_attn.v_proj.weight] - tensor.CPU.register () -> (%1867:tensor<[128], Float32, CPU>[@model.layers.0.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=74), symbol:model.layers.0.self_attn.q_norm.weight])[symbol:model.layers.0.self_attn.q_norm.weight] - tensor.CPU.register () -> (%7469:tensor<[128], Float32, CPU>[@model.layers.0.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=76), symbol:model.layers.0.self_attn.k_norm.weight])[symbol:model.layers.0.self_attn.k_norm.weight] - tensor.CPU.register () -> (%7880:tensor<[2048, 2048], Float32, CPU>[@model.layers.0.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=89), symbol:model.layers.0.self_attn.o_proj.weight])[symbol:model.layers.0.self_attn.o_proj.weight] - tensor.CPU.register () -> (%3163:tensor<[2048], Float32, CPU>[@model.layers.0.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=92), symbol:model.layers.0.post_attention_layernorm.weight])[symbol:model.layers.0.post_attention_layernorm.weight] - tensor.CPU.register () -> (%3038:tensor<[6144, 2048], Float32, CPU>[@model.layers.0.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=93), symbol:model.layers.0.mlp.gate_proj.weight])[symbol:model.layers.0.mlp.gate_proj.weight] - tensor.CPU.register () -> (%184:tensor<[6144, 2048], Float32, CPU>[@model.layers.0.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, 
quant_to_type: UInt4, scale_1_type: Float32), uuid=96), symbol:model.layers.0.mlp.up_proj.weight])[symbol:model.layers.0.mlp.up_proj.weight] - tensor.CPU.register () -> (%7449:tensor<[2048, 6144], Float32, CPU>[@model.layers.0.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=98), symbol:model.layers.0.mlp.down_proj.weight])[symbol:model.layers.0.mlp.down_proj.weight] - tensor.CPU.register () -> (%3526:tensor<[2048], Float32, CPU>[@model.layers.1.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=101), symbol:model.layers.1.input_layernorm.weight])[symbol:model.layers.1.input_layernorm.weight] - tensor.CPU.register () -> (%2471:tensor<[2048, 2048], Float32, CPU>[@model.layers.1.self_attn.q_proj.weight][symbol:model.layers.1.self_attn.q_proj.weight])[symbol:model.layers.1.self_attn.q_proj.weight] - tensor.CPU.register () -> (%5492:tensor<[1024, 2048], Float32, CPU>[@model.layers.1.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=102), symbol:model.layers.1.self_attn.k_proj.weight])[symbol:model.layers.1.self_attn.k_proj.weight] - tensor.CPU.register () -> (%554:tensor<[1024, 2048], Float32, CPU>[@model.layers.1.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=104), symbol:model.layers.1.self_attn.v_proj.weight])[symbol:model.layers.1.self_attn.v_proj.weight] - tensor.CPU.register () -> (%5159:tensor<[128], Float32, CPU>[@model.layers.1.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=108), 
symbol:model.layers.1.self_attn.q_norm.weight])[symbol:model.layers.1.self_attn.q_norm.weight] - tensor.CPU.register () -> (%6337:tensor<[128], Float32, CPU>[@model.layers.1.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=110), symbol:model.layers.1.self_attn.k_norm.weight])[symbol:model.layers.1.self_attn.k_norm.weight] - tensor.CPU.register () -> (%3431:tensor<[2048, 2048], Float32, CPU>[@model.layers.1.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=123), symbol:model.layers.1.self_attn.o_proj.weight])[symbol:model.layers.1.self_attn.o_proj.weight] - tensor.CPU.register () -> (%7183:tensor<[2048], Float32, CPU>[@model.layers.1.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=126), symbol:model.layers.1.post_attention_layernorm.weight])[symbol:model.layers.1.post_attention_layernorm.weight] - tensor.CPU.register () -> (%6960:tensor<[6144, 2048], Float32, CPU>[@model.layers.1.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=127), symbol:model.layers.1.mlp.gate_proj.weight])[symbol:model.layers.1.mlp.gate_proj.weight] - tensor.CPU.register () -> (%7251:tensor<[6144, 2048], Float32, CPU>[@model.layers.1.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=130), symbol:model.layers.1.mlp.up_proj.weight])[symbol:model.layers.1.mlp.up_proj.weight] - tensor.CPU.register () -> (%6256:tensor<[2048, 6144], Float32, 
CPU>[@model.layers.1.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=132), symbol:model.layers.1.mlp.down_proj.weight])[symbol:model.layers.1.mlp.down_proj.weight] - tensor.CPU.register () -> (%7411:tensor<[2048], Float32, CPU>[@model.layers.2.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=135), symbol:model.layers.2.input_layernorm.weight])[symbol:model.layers.2.input_layernorm.weight] - tensor.CPU.register () -> (%4879:tensor<[2048, 2048], Float32, CPU>[@model.layers.2.self_attn.q_proj.weight][symbol:model.layers.2.self_attn.q_proj.weight])[symbol:model.layers.2.self_attn.q_proj.weight] - tensor.CPU.register () -> (%725:tensor<[1024, 2048], Float32, CPU>[@model.layers.2.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=136), symbol:model.layers.2.self_attn.k_proj.weight])[symbol:model.layers.2.self_attn.k_proj.weight] - tensor.CPU.register () -> (%2701:tensor<[1024, 2048], Float32, CPU>[@model.layers.2.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=138), symbol:model.layers.2.self_attn.v_proj.weight])[symbol:model.layers.2.self_attn.v_proj.weight] - tensor.CPU.register () -> (%7660:tensor<[128], Float32, CPU>[@model.layers.2.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=142), symbol:model.layers.2.self_attn.q_norm.weight])[symbol:model.layers.2.self_attn.q_norm.weight] - tensor.CPU.register () -> (%5749:tensor<[128], Float32, CPU>[@model.layers.2.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=144), symbol:model.layers.2.self_attn.k_norm.weight])[symbol:model.layers.2.self_attn.k_norm.weight] - tensor.CPU.register () -> (%1525:tensor<[2048, 2048], Float32, CPU>[@model.layers.2.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=157), symbol:model.layers.2.self_attn.o_proj.weight])[symbol:model.layers.2.self_attn.o_proj.weight] - tensor.CPU.register () -> (%6444:tensor<[2048], Float32, CPU>[@model.layers.2.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=160), symbol:model.layers.2.post_attention_layernorm.weight])[symbol:model.layers.2.post_attention_layernorm.weight] - tensor.CPU.register () -> (%3201:tensor<[6144, 2048], Float32, CPU>[@model.layers.2.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=161), symbol:model.layers.2.mlp.gate_proj.weight])[symbol:model.layers.2.mlp.gate_proj.weight] - tensor.CPU.register () -> (%4120:tensor<[6144, 2048], Float32, CPU>[@model.layers.2.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=164), symbol:model.layers.2.mlp.up_proj.weight])[symbol:model.layers.2.mlp.up_proj.weight] - tensor.CPU.register () -> (%1962:tensor<[2048, 6144], Float32, CPU>[@model.layers.2.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=166), symbol:model.layers.2.mlp.down_proj.weight])[symbol:model.layers.2.mlp.down_proj.weight] 
- tensor.CPU.register () -> (%3250:tensor<[2048], Float32, CPU>[@model.layers.3.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=169), symbol:model.layers.3.input_layernorm.weight])[symbol:model.layers.3.input_layernorm.weight] - tensor.CPU.register () -> (%5564:tensor<[2048, 2048], Float32, CPU>[@model.layers.3.self_attn.q_proj.weight][symbol:model.layers.3.self_attn.q_proj.weight])[symbol:model.layers.3.self_attn.q_proj.weight] - tensor.CPU.register () -> (%3502:tensor<[1024, 2048], Float32, CPU>[@model.layers.3.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=170), symbol:model.layers.3.self_attn.k_proj.weight])[symbol:model.layers.3.self_attn.k_proj.weight] - tensor.CPU.register () -> (%2402:tensor<[1024, 2048], Float32, CPU>[@model.layers.3.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=172), symbol:model.layers.3.self_attn.v_proj.weight])[symbol:model.layers.3.self_attn.v_proj.weight] - tensor.CPU.register () -> (%1747:tensor<[128], Float32, CPU>[@model.layers.3.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=176), symbol:model.layers.3.self_attn.q_norm.weight])[symbol:model.layers.3.self_attn.q_norm.weight] - tensor.CPU.register () -> (%4846:tensor<[128], Float32, CPU>[@model.layers.3.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=178), symbol:model.layers.3.self_attn.k_norm.weight])[symbol:model.layers.3.self_attn.k_norm.weight] - tensor.CPU.register () -> (%3109:tensor<[2048, 2048], Float32, 
CPU>[@model.layers.3.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=191), symbol:model.layers.3.self_attn.o_proj.weight])[symbol:model.layers.3.self_attn.o_proj.weight] - tensor.CPU.register () -> (%7221:tensor<[2048], Float32, CPU>[@model.layers.3.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=194), symbol:model.layers.3.post_attention_layernorm.weight])[symbol:model.layers.3.post_attention_layernorm.weight] - tensor.CPU.register () -> (%7181:tensor<[6144, 2048], Float32, CPU>[@model.layers.3.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=195), symbol:model.layers.3.mlp.gate_proj.weight])[symbol:model.layers.3.mlp.gate_proj.weight] - tensor.CPU.register () -> (%2714:tensor<[6144, 2048], Float32, CPU>[@model.layers.3.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=198), symbol:model.layers.3.mlp.up_proj.weight])[symbol:model.layers.3.mlp.up_proj.weight] - tensor.CPU.register () -> (%4573:tensor<[2048, 6144], Float32, CPU>[@model.layers.3.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=200), symbol:model.layers.3.mlp.down_proj.weight])[symbol:model.layers.3.mlp.down_proj.weight] - tensor.CPU.register () -> (%5536:tensor<[2048], Float32, CPU>[@model.layers.4.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=203), 
symbol:model.layers.4.input_layernorm.weight])[symbol:model.layers.4.input_layernorm.weight] - tensor.CPU.register () -> (%463:tensor<[2048, 2048], Float32, CPU>[@model.layers.4.self_attn.q_proj.weight][symbol:model.layers.4.self_attn.q_proj.weight])[symbol:model.layers.4.self_attn.q_proj.weight] - tensor.CPU.register () -> (%5989:tensor<[1024, 2048], Float32, CPU>[@model.layers.4.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=204), symbol:model.layers.4.self_attn.k_proj.weight])[symbol:model.layers.4.self_attn.k_proj.weight] - tensor.CPU.register () -> (%3443:tensor<[1024, 2048], Float32, CPU>[@model.layers.4.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=206), symbol:model.layers.4.self_attn.v_proj.weight])[symbol:model.layers.4.self_attn.v_proj.weight] - tensor.CPU.register () -> (%926:tensor<[128], Float32, CPU>[@model.layers.4.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=210), symbol:model.layers.4.self_attn.q_norm.weight])[symbol:model.layers.4.self_attn.q_norm.weight] - tensor.CPU.register () -> (%5648:tensor<[128], Float32, CPU>[@model.layers.4.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=212), symbol:model.layers.4.self_attn.k_norm.weight])[symbol:model.layers.4.self_attn.k_norm.weight] - tensor.CPU.register () -> (%256:tensor<[2048, 2048], Float32, CPU>[@model.layers.4.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=225), 
symbol:model.layers.4.self_attn.o_proj.weight])[symbol:model.layers.4.self_attn.o_proj.weight] - tensor.CPU.register () -> (%3101:tensor<[2048], Float32, CPU>[@model.layers.4.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=228), symbol:model.layers.4.post_attention_layernorm.weight])[symbol:model.layers.4.post_attention_layernorm.weight] - tensor.CPU.register () -> (%15:tensor<[6144, 2048], Float32, CPU>[@model.layers.4.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=229), symbol:model.layers.4.mlp.gate_proj.weight])[symbol:model.layers.4.mlp.gate_proj.weight] - tensor.CPU.register () -> (%3494:tensor<[6144, 2048], Float32, CPU>[@model.layers.4.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=232), symbol:model.layers.4.mlp.up_proj.weight])[symbol:model.layers.4.mlp.up_proj.weight] - tensor.CPU.register () -> (%6518:tensor<[2048, 6144], Float32, CPU>[@model.layers.4.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=234), symbol:model.layers.4.mlp.down_proj.weight])[symbol:model.layers.4.mlp.down_proj.weight] - tensor.CPU.register () -> (%7246:tensor<[2048], Float32, CPU>[@model.layers.5.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=237), symbol:model.layers.5.input_layernorm.weight])[symbol:model.layers.5.input_layernorm.weight] - tensor.CPU.register () -> (%3752:tensor<[2048, 2048], Float32, 
CPU>[@model.layers.5.self_attn.q_proj.weight][symbol:model.layers.5.self_attn.q_proj.weight])[symbol:model.layers.5.self_attn.q_proj.weight] - tensor.CPU.register () -> (%2143:tensor<[1024, 2048], Float32, CPU>[@model.layers.5.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=238), symbol:model.layers.5.self_attn.k_proj.weight])[symbol:model.layers.5.self_attn.k_proj.weight] - tensor.CPU.register () -> (%5753:tensor<[1024, 2048], Float32, CPU>[@model.layers.5.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=240), symbol:model.layers.5.self_attn.v_proj.weight])[symbol:model.layers.5.self_attn.v_proj.weight] - tensor.CPU.register () -> (%4774:tensor<[128], Float32, CPU>[@model.layers.5.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=244), symbol:model.layers.5.self_attn.q_norm.weight])[symbol:model.layers.5.self_attn.q_norm.weight] - tensor.CPU.register () -> (%1215:tensor<[128], Float32, CPU>[@model.layers.5.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=246), symbol:model.layers.5.self_attn.k_norm.weight])[symbol:model.layers.5.self_attn.k_norm.weight] - tensor.CPU.register () -> (%2076:tensor<[2048, 2048], Float32, CPU>[@model.layers.5.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=259), symbol:model.layers.5.self_attn.o_proj.weight])[symbol:model.layers.5.self_attn.o_proj.weight] - tensor.CPU.register () -> (%6883:tensor<[2048], Float32, 
CPU>[@model.layers.5.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=262), symbol:model.layers.5.post_attention_layernorm.weight])[symbol:model.layers.5.post_attention_layernorm.weight] - tensor.CPU.register () -> (%5485:tensor<[6144, 2048], Float32, CPU>[@model.layers.5.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=263), symbol:model.layers.5.mlp.gate_proj.weight])[symbol:model.layers.5.mlp.gate_proj.weight] - tensor.CPU.register () -> (%759:tensor<[6144, 2048], Float32, CPU>[@model.layers.5.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=266), symbol:model.layers.5.mlp.up_proj.weight])[symbol:model.layers.5.mlp.up_proj.weight] - tensor.CPU.register () -> (%6315:tensor<[2048, 6144], Float32, CPU>[@model.layers.5.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=268), symbol:model.layers.5.mlp.down_proj.weight])[symbol:model.layers.5.mlp.down_proj.weight] - tensor.CPU.register () -> (%7090:tensor<[2048], Float32, CPU>[@model.layers.6.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=271), symbol:model.layers.6.input_layernorm.weight])[symbol:model.layers.6.input_layernorm.weight] - tensor.CPU.register () -> (%3125:tensor<[2048, 2048], Float32, CPU>[@model.layers.6.self_attn.q_proj.weight][symbol:model.layers.6.self_attn.q_proj.weight])[symbol:model.layers.6.self_attn.q_proj.weight] - tensor.CPU.register () -> (%1798:tensor<[1024, 2048], Float32, 
CPU>[@model.layers.6.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=272), symbol:model.layers.6.self_attn.k_proj.weight])[symbol:model.layers.6.self_attn.k_proj.weight] - tensor.CPU.register () -> (%1047:tensor<[1024, 2048], Float32, CPU>[@model.layers.6.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=274), symbol:model.layers.6.self_attn.v_proj.weight])[symbol:model.layers.6.self_attn.v_proj.weight] - tensor.CPU.register () -> (%7385:tensor<[128], Float32, CPU>[@model.layers.6.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=278), symbol:model.layers.6.self_attn.q_norm.weight])[symbol:model.layers.6.self_attn.q_norm.weight] - tensor.CPU.register () -> (%5603:tensor<[128], Float32, CPU>[@model.layers.6.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=280), symbol:model.layers.6.self_attn.k_norm.weight])[symbol:model.layers.6.self_attn.k_norm.weight] - tensor.CPU.register () -> (%6862:tensor<[2048, 2048], Float32, CPU>[@model.layers.6.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=293), symbol:model.layers.6.self_attn.o_proj.weight])[symbol:model.layers.6.self_attn.o_proj.weight] - tensor.CPU.register () -> (%4161:tensor<[2048], Float32, CPU>[@model.layers.6.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=296), 
symbol:model.layers.6.post_attention_layernorm.weight])[symbol:model.layers.6.post_attention_layernorm.weight] - tensor.CPU.register () -> (%5295:tensor<[6144, 2048], Float32, CPU>[@model.layers.6.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=297), symbol:model.layers.6.mlp.gate_proj.weight])[symbol:model.layers.6.mlp.gate_proj.weight] - tensor.CPU.register () -> (%4710:tensor<[6144, 2048], Float32, CPU>[@model.layers.6.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=300), symbol:model.layers.6.mlp.up_proj.weight])[symbol:model.layers.6.mlp.up_proj.weight] - tensor.CPU.register () -> (%4929:tensor<[2048, 6144], Float32, CPU>[@model.layers.6.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=302), symbol:model.layers.6.mlp.down_proj.weight])[symbol:model.layers.6.mlp.down_proj.weight] - tensor.CPU.register () -> (%4605:tensor<[2048], Float32, CPU>[@model.layers.7.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=305), symbol:model.layers.7.input_layernorm.weight])[symbol:model.layers.7.input_layernorm.weight] - tensor.CPU.register () -> (%4585:tensor<[2048, 2048], Float32, CPU>[@model.layers.7.self_attn.q_proj.weight][symbol:model.layers.7.self_attn.q_proj.weight])[symbol:model.layers.7.self_attn.q_proj.weight] - tensor.CPU.register () -> (%1:tensor<[1024, 2048], Float32, CPU>[@model.layers.7.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: 
Float32), uuid=306), symbol:model.layers.7.self_attn.k_proj.weight])[symbol:model.layers.7.self_attn.k_proj.weight] - tensor.CPU.register () -> (%2341:tensor<[1024, 2048], Float32, CPU>[@model.layers.7.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=308), symbol:model.layers.7.self_attn.v_proj.weight])[symbol:model.layers.7.self_attn.v_proj.weight] - tensor.CPU.register () -> (%5151:tensor<[128], Float32, CPU>[@model.layers.7.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=312), symbol:model.layers.7.self_attn.q_norm.weight])[symbol:model.layers.7.self_attn.q_norm.weight] - tensor.CPU.register () -> (%3437:tensor<[128], Float32, CPU>[@model.layers.7.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=314), symbol:model.layers.7.self_attn.k_norm.weight])[symbol:model.layers.7.self_attn.k_norm.weight] - tensor.CPU.register () -> (%3368:tensor<[2048, 2048], Float32, CPU>[@model.layers.7.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=327), symbol:model.layers.7.self_attn.o_proj.weight])[symbol:model.layers.7.self_attn.o_proj.weight] - tensor.CPU.register () -> (%68:tensor<[2048], Float32, CPU>[@model.layers.7.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=330), symbol:model.layers.7.post_attention_layernorm.weight])[symbol:model.layers.7.post_attention_layernorm.weight] - tensor.CPU.register () -> (%324:tensor<[6144, 2048], Float32, CPU>[@model.layers.7.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=331), symbol:model.layers.7.mlp.gate_proj.weight])[symbol:model.layers.7.mlp.gate_proj.weight] - tensor.CPU.register () -> (%5551:tensor<[6144, 2048], Float32, CPU>[@model.layers.7.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=334), symbol:model.layers.7.mlp.up_proj.weight])[symbol:model.layers.7.mlp.up_proj.weight] - tensor.CPU.register () -> (%7894:tensor<[2048, 6144], Float32, CPU>[@model.layers.7.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=336), symbol:model.layers.7.mlp.down_proj.weight])[symbol:model.layers.7.mlp.down_proj.weight] - tensor.CPU.register () -> (%3851:tensor<[2048], Float32, CPU>[@model.layers.8.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=339), symbol:model.layers.8.input_layernorm.weight])[symbol:model.layers.8.input_layernorm.weight] - tensor.CPU.register () -> (%5874:tensor<[2048, 2048], Float32, CPU>[@model.layers.8.self_attn.q_proj.weight][symbol:model.layers.8.self_attn.q_proj.weight])[symbol:model.layers.8.self_attn.q_proj.weight] - tensor.CPU.register () -> (%1863:tensor<[1024, 2048], Float32, CPU>[@model.layers.8.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=340), symbol:model.layers.8.self_attn.k_proj.weight])[symbol:model.layers.8.self_attn.k_proj.weight] - tensor.CPU.register () -> (%3204:tensor<[1024, 2048], Float32, CPU>[@model.layers.8.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, 
ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=342), symbol:model.layers.8.self_attn.v_proj.weight])[symbol:model.layers.8.self_attn.v_proj.weight] - tensor.CPU.register () -> (%2301:tensor<[128], Float32, CPU>[@model.layers.8.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=346), symbol:model.layers.8.self_attn.q_norm.weight])[symbol:model.layers.8.self_attn.q_norm.weight] - tensor.CPU.register () -> (%7373:tensor<[128], Float32, CPU>[@model.layers.8.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=348), symbol:model.layers.8.self_attn.k_norm.weight])[symbol:model.layers.8.self_attn.k_norm.weight] - tensor.CPU.register () -> (%6303:tensor<[2048, 2048], Float32, CPU>[@model.layers.8.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=361), symbol:model.layers.8.self_attn.o_proj.weight])[symbol:model.layers.8.self_attn.o_proj.weight] - tensor.CPU.register () -> (%1997:tensor<[2048], Float32, CPU>[@model.layers.8.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=364), symbol:model.layers.8.post_attention_layernorm.weight])[symbol:model.layers.8.post_attention_layernorm.weight] - tensor.CPU.register () -> (%6731:tensor<[6144, 2048], Float32, CPU>[@model.layers.8.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=365), symbol:model.layers.8.mlp.gate_proj.weight])[symbol:model.layers.8.mlp.gate_proj.weight] - tensor.CPU.register () -> (%5478:tensor<[6144, 2048], Float32, 
CPU>[@model.layers.8.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=368), symbol:model.layers.8.mlp.up_proj.weight])[symbol:model.layers.8.mlp.up_proj.weight] - tensor.CPU.register () -> (%4734:tensor<[2048, 6144], Float32, CPU>[@model.layers.8.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=370), symbol:model.layers.8.mlp.down_proj.weight])[symbol:model.layers.8.mlp.down_proj.weight] - tensor.CPU.register () -> (%4963:tensor<[2048], Float32, CPU>[@model.layers.9.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=373), symbol:model.layers.9.input_layernorm.weight])[symbol:model.layers.9.input_layernorm.weight] - tensor.CPU.register () -> (%137:tensor<[2048, 2048], Float32, CPU>[@model.layers.9.self_attn.q_proj.weight][symbol:model.layers.9.self_attn.q_proj.weight])[symbol:model.layers.9.self_attn.q_proj.weight] - tensor.CPU.register () -> (%2689:tensor<[1024, 2048], Float32, CPU>[@model.layers.9.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=374), symbol:model.layers.9.self_attn.k_proj.weight])[symbol:model.layers.9.self_attn.k_proj.weight] - tensor.CPU.register () -> (%4027:tensor<[1024, 2048], Float32, CPU>[@model.layers.9.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=376), symbol:model.layers.9.self_attn.v_proj.weight])[symbol:model.layers.9.self_attn.v_proj.weight] - tensor.CPU.register () -> (%1375:tensor<[128], 
Float32, CPU>[@model.layers.9.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=380), symbol:model.layers.9.self_attn.q_norm.weight])[symbol:model.layers.9.self_attn.q_norm.weight] - tensor.CPU.register () -> (%4962:tensor<[128], Float32, CPU>[@model.layers.9.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=382), symbol:model.layers.9.self_attn.k_norm.weight])[symbol:model.layers.9.self_attn.k_norm.weight] - tensor.CPU.register () -> (%6399:tensor<[2048, 2048], Float32, CPU>[@model.layers.9.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=395), symbol:model.layers.9.self_attn.o_proj.weight])[symbol:model.layers.9.self_attn.o_proj.weight] - tensor.CPU.register () -> (%2594:tensor<[2048], Float32, CPU>[@model.layers.9.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=398), symbol:model.layers.9.post_attention_layernorm.weight])[symbol:model.layers.9.post_attention_layernorm.weight] - tensor.CPU.register () -> (%3833:tensor<[6144, 2048], Float32, CPU>[@model.layers.9.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=399), symbol:model.layers.9.mlp.gate_proj.weight])[symbol:model.layers.9.mlp.gate_proj.weight] - tensor.CPU.register () -> (%2358:tensor<[6144, 2048], Float32, CPU>[@model.layers.9.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=402), symbol:model.layers.9.mlp.up_proj.weight])[symbol:model.layers.9.mlp.up_proj.weight] - 
tensor.CPU.register () -> (%3947:tensor<[2048, 6144], Float32, CPU>[@model.layers.9.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=404), symbol:model.layers.9.mlp.down_proj.weight])[symbol:model.layers.9.mlp.down_proj.weight] - tensor.CPU.register () -> (%3229:tensor<[2048], Float32, CPU>[@model.layers.10.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=407), symbol:model.layers.10.input_layernorm.weight])[symbol:model.layers.10.input_layernorm.weight] - tensor.CPU.register () -> (%5022:tensor<[2048, 2048], Float32, CPU>[@model.layers.10.self_attn.q_proj.weight][symbol:model.layers.10.self_attn.q_proj.weight])[symbol:model.layers.10.self_attn.q_proj.weight] - tensor.CPU.register () -> (%2867:tensor<[1024, 2048], Float32, CPU>[@model.layers.10.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=408), symbol:model.layers.10.self_attn.k_proj.weight])[symbol:model.layers.10.self_attn.k_proj.weight] - tensor.CPU.register () -> (%567:tensor<[1024, 2048], Float32, CPU>[@model.layers.10.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=410), symbol:model.layers.10.self_attn.v_proj.weight])[symbol:model.layers.10.self_attn.v_proj.weight] - tensor.CPU.register () -> (%7008:tensor<[128], Float32, CPU>[@model.layers.10.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=414), symbol:model.layers.10.self_attn.q_norm.weight])[symbol:model.layers.10.self_attn.q_norm.weight] - tensor.CPU.register () -> (%6953:tensor<[128], Float32, 
CPU>[@model.layers.10.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=416), symbol:model.layers.10.self_attn.k_norm.weight])[symbol:model.layers.10.self_attn.k_norm.weight] - tensor.CPU.register () -> (%5479:tensor<[2048, 2048], Float32, CPU>[@model.layers.10.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=429), symbol:model.layers.10.self_attn.o_proj.weight])[symbol:model.layers.10.self_attn.o_proj.weight] - tensor.CPU.register () -> (%3177:tensor<[2048], Float32, CPU>[@model.layers.10.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=432), symbol:model.layers.10.post_attention_layernorm.weight])[symbol:model.layers.10.post_attention_layernorm.weight] - tensor.CPU.register () -> (%7857:tensor<[6144, 2048], Float32, CPU>[@model.layers.10.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=433), symbol:model.layers.10.mlp.gate_proj.weight])[symbol:model.layers.10.mlp.gate_proj.weight] - tensor.CPU.register () -> (%3620:tensor<[6144, 2048], Float32, CPU>[@model.layers.10.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=436), symbol:model.layers.10.mlp.up_proj.weight])[symbol:model.layers.10.mlp.up_proj.weight] - tensor.CPU.register () -> (%4172:tensor<[2048, 6144], Float32, CPU>[@model.layers.10.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: 
Float32), uuid=438), symbol:model.layers.10.mlp.down_proj.weight])[symbol:model.layers.10.mlp.down_proj.weight] - tensor.CPU.register () -> (%1820:tensor<[2048], Float32, CPU>[@model.layers.11.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=441), symbol:model.layers.11.input_layernorm.weight])[symbol:model.layers.11.input_layernorm.weight] - tensor.CPU.register () -> (%4375:tensor<[2048, 2048], Float32, CPU>[@model.layers.11.self_attn.q_proj.weight][symbol:model.layers.11.self_attn.q_proj.weight])[symbol:model.layers.11.self_attn.q_proj.weight] - tensor.CPU.register () -> (%3805:tensor<[1024, 2048], Float32, CPU>[@model.layers.11.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=442), symbol:model.layers.11.self_attn.k_proj.weight])[symbol:model.layers.11.self_attn.k_proj.weight] - tensor.CPU.register () -> (%5348:tensor<[1024, 2048], Float32, CPU>[@model.layers.11.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=444), symbol:model.layers.11.self_attn.v_proj.weight])[symbol:model.layers.11.self_attn.v_proj.weight] - tensor.CPU.register () -> (%1018:tensor<[128], Float32, CPU>[@model.layers.11.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=448), symbol:model.layers.11.self_attn.q_norm.weight])[symbol:model.layers.11.self_attn.q_norm.weight] - tensor.CPU.register () -> (%5323:tensor<[128], Float32, CPU>[@model.layers.11.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=450), symbol:model.layers.11.self_attn.k_norm.weight])[symbol:model.layers.11.self_attn.k_norm.weight] 
- tensor.CPU.register () -> (%6587:tensor<[2048, 2048], Float32, CPU>[@model.layers.11.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=463), symbol:model.layers.11.self_attn.o_proj.weight])[symbol:model.layers.11.self_attn.o_proj.weight] - tensor.CPU.register () -> (%2072:tensor<[2048], Float32, CPU>[@model.layers.11.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=466), symbol:model.layers.11.post_attention_layernorm.weight])[symbol:model.layers.11.post_attention_layernorm.weight] - tensor.CPU.register () -> (%5180:tensor<[6144, 2048], Float32, CPU>[@model.layers.11.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=467), symbol:model.layers.11.mlp.gate_proj.weight])[symbol:model.layers.11.mlp.gate_proj.weight] - tensor.CPU.register () -> (%1917:tensor<[6144, 2048], Float32, CPU>[@model.layers.11.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=470), symbol:model.layers.11.mlp.up_proj.weight])[symbol:model.layers.11.mlp.up_proj.weight] - tensor.CPU.register () -> (%2810:tensor<[2048, 6144], Float32, CPU>[@model.layers.11.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=472), symbol:model.layers.11.mlp.down_proj.weight])[symbol:model.layers.11.mlp.down_proj.weight] - tensor.CPU.register () -> (%4945:tensor<[2048], Float32, CPU>[@model.layers.12.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=475), symbol:model.layers.12.input_layernorm.weight])[symbol:model.layers.12.input_layernorm.weight] - tensor.CPU.register () -> (%6926:tensor<[2048, 2048], Float32, CPU>[@model.layers.12.self_attn.q_proj.weight][symbol:model.layers.12.self_attn.q_proj.weight])[symbol:model.layers.12.self_attn.q_proj.weight] - tensor.CPU.register () -> (%2741:tensor<[1024, 2048], Float32, CPU>[@model.layers.12.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=476), symbol:model.layers.12.self_attn.k_proj.weight])[symbol:model.layers.12.self_attn.k_proj.weight] - tensor.CPU.register () -> (%3690:tensor<[1024, 2048], Float32, CPU>[@model.layers.12.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=478), symbol:model.layers.12.self_attn.v_proj.weight])[symbol:model.layers.12.self_attn.v_proj.weight] - tensor.CPU.register () -> (%5447:tensor<[128], Float32, CPU>[@model.layers.12.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=482), symbol:model.layers.12.self_attn.q_norm.weight])[symbol:model.layers.12.self_attn.q_norm.weight] - tensor.CPU.register () -> (%5437:tensor<[128], Float32, CPU>[@model.layers.12.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=484), symbol:model.layers.12.self_attn.k_norm.weight])[symbol:model.layers.12.self_attn.k_norm.weight] - tensor.CPU.register () -> (%4785:tensor<[2048, 2048], Float32, CPU>[@model.layers.12.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), 
uuid=497), symbol:model.layers.12.self_attn.o_proj.weight])[symbol:model.layers.12.self_attn.o_proj.weight] - tensor.CPU.register () -> (%1343:tensor<[2048], Float32, CPU>[@model.layers.12.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=500), symbol:model.layers.12.post_attention_layernorm.weight])[symbol:model.layers.12.post_attention_layernorm.weight] - tensor.CPU.register () -> (%3306:tensor<[6144, 2048], Float32, CPU>[@model.layers.12.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=501), symbol:model.layers.12.mlp.gate_proj.weight])[symbol:model.layers.12.mlp.gate_proj.weight] - tensor.CPU.register () -> (%2123:tensor<[6144, 2048], Float32, CPU>[@model.layers.12.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=504), symbol:model.layers.12.mlp.up_proj.weight])[symbol:model.layers.12.mlp.up_proj.weight] - tensor.CPU.register () -> (%2005:tensor<[2048, 6144], Float32, CPU>[@model.layers.12.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=506), symbol:model.layers.12.mlp.down_proj.weight])[symbol:model.layers.12.mlp.down_proj.weight] - tensor.CPU.register () -> (%1812:tensor<[2048], Float32, CPU>[@model.layers.13.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=509), symbol:model.layers.13.input_layernorm.weight])[symbol:model.layers.13.input_layernorm.weight] - tensor.CPU.register () -> (%7043:tensor<[2048, 2048], Float32, 
CPU>[@model.layers.13.self_attn.q_proj.weight][symbol:model.layers.13.self_attn.q_proj.weight])[symbol:model.layers.13.self_attn.q_proj.weight] - tensor.CPU.register () -> (%229:tensor<[1024, 2048], Float32, CPU>[@model.layers.13.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=510), symbol:model.layers.13.self_attn.k_proj.weight])[symbol:model.layers.13.self_attn.k_proj.weight] - tensor.CPU.register () -> (%1019:tensor<[1024, 2048], Float32, CPU>[@model.layers.13.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=512), symbol:model.layers.13.self_attn.v_proj.weight])[symbol:model.layers.13.self_attn.v_proj.weight] - tensor.CPU.register () -> (%3318:tensor<[128], Float32, CPU>[@model.layers.13.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=516), symbol:model.layers.13.self_attn.q_norm.weight])[symbol:model.layers.13.self_attn.q_norm.weight] - tensor.CPU.register () -> (%2503:tensor<[128], Float32, CPU>[@model.layers.13.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=518), symbol:model.layers.13.self_attn.k_norm.weight])[symbol:model.layers.13.self_attn.k_norm.weight] - tensor.CPU.register () -> (%3883:tensor<[2048, 2048], Float32, CPU>[@model.layers.13.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=531), symbol:model.layers.13.self_attn.o_proj.weight])[symbol:model.layers.13.self_attn.o_proj.weight] - tensor.CPU.register () -> (%6904:tensor<[2048], Float32, 
CPU>[@model.layers.13.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=534), symbol:model.layers.13.post_attention_layernorm.weight])[symbol:model.layers.13.post_attention_layernorm.weight] - tensor.CPU.register () -> (%5444:tensor<[6144, 2048], Float32, CPU>[@model.layers.13.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=535), symbol:model.layers.13.mlp.gate_proj.weight])[symbol:model.layers.13.mlp.gate_proj.weight] - tensor.CPU.register () -> (%3100:tensor<[6144, 2048], Float32, CPU>[@model.layers.13.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=538), symbol:model.layers.13.mlp.up_proj.weight])[symbol:model.layers.13.mlp.up_proj.weight] - tensor.CPU.register () -> (%6631:tensor<[2048, 6144], Float32, CPU>[@model.layers.13.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=540), symbol:model.layers.13.mlp.down_proj.weight])[symbol:model.layers.13.mlp.down_proj.weight] - tensor.CPU.register () -> (%5555:tensor<[2048], Float32, CPU>[@model.layers.14.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=543), symbol:model.layers.14.input_layernorm.weight])[symbol:model.layers.14.input_layernorm.weight] - tensor.CPU.register () -> (%1210:tensor<[2048, 2048], Float32, CPU>[@model.layers.14.self_attn.q_proj.weight][symbol:model.layers.14.self_attn.q_proj.weight])[symbol:model.layers.14.self_attn.q_proj.weight] - tensor.CPU.register () -> (%3756:tensor<[1024, 2048], 
Float32, CPU>[@model.layers.14.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=544), symbol:model.layers.14.self_attn.k_proj.weight])[symbol:model.layers.14.self_attn.k_proj.weight] - tensor.CPU.register () -> (%5243:tensor<[1024, 2048], Float32, CPU>[@model.layers.14.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=546), symbol:model.layers.14.self_attn.v_proj.weight])[symbol:model.layers.14.self_attn.v_proj.weight] - tensor.CPU.register () -> (%3796:tensor<[128], Float32, CPU>[@model.layers.14.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=550), symbol:model.layers.14.self_attn.q_norm.weight])[symbol:model.layers.14.self_attn.q_norm.weight] - tensor.CPU.register () -> (%3974:tensor<[128], Float32, CPU>[@model.layers.14.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=552), symbol:model.layers.14.self_attn.k_norm.weight])[symbol:model.layers.14.self_attn.k_norm.weight] - tensor.CPU.register () -> (%3797:tensor<[2048, 2048], Float32, CPU>[@model.layers.14.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=565), symbol:model.layers.14.self_attn.o_proj.weight])[symbol:model.layers.14.self_attn.o_proj.weight] - tensor.CPU.register () -> (%4508:tensor<[2048], Float32, CPU>[@model.layers.14.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=568), 
symbol:model.layers.14.post_attention_layernorm.weight])[symbol:model.layers.14.post_attention_layernorm.weight] - tensor.CPU.register () -> (%7092:tensor<[6144, 2048], Float32, CPU>[@model.layers.14.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=569), symbol:model.layers.14.mlp.gate_proj.weight])[symbol:model.layers.14.mlp.gate_proj.weight] - tensor.CPU.register () -> (%7164:tensor<[6144, 2048], Float32, CPU>[@model.layers.14.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=572), symbol:model.layers.14.mlp.up_proj.weight])[symbol:model.layers.14.mlp.up_proj.weight] - tensor.CPU.register () -> (%4419:tensor<[2048, 6144], Float32, CPU>[@model.layers.14.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=574), symbol:model.layers.14.mlp.down_proj.weight])[symbol:model.layers.14.mlp.down_proj.weight] - tensor.CPU.register () -> (%5590:tensor<[2048], Float32, CPU>[@model.layers.15.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=577), symbol:model.layers.15.input_layernorm.weight])[symbol:model.layers.15.input_layernorm.weight] - tensor.CPU.register () -> (%5843:tensor<[2048, 2048], Float32, CPU>[@model.layers.15.self_attn.q_proj.weight][symbol:model.layers.15.self_attn.q_proj.weight])[symbol:model.layers.15.self_attn.q_proj.weight] - tensor.CPU.register () -> (%938:tensor<[1024, 2048], Float32, CPU>[@model.layers.15.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, 
scale_1_type: Float32), uuid=578), symbol:model.layers.15.self_attn.k_proj.weight])[symbol:model.layers.15.self_attn.k_proj.weight] - tensor.CPU.register () -> (%3967:tensor<[1024, 2048], Float32, CPU>[@model.layers.15.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=580), symbol:model.layers.15.self_attn.v_proj.weight])[symbol:model.layers.15.self_attn.v_proj.weight] - tensor.CPU.register () -> (%3289:tensor<[128], Float32, CPU>[@model.layers.15.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=584), symbol:model.layers.15.self_attn.q_norm.weight])[symbol:model.layers.15.self_attn.q_norm.weight] - tensor.CPU.register () -> (%6756:tensor<[128], Float32, CPU>[@model.layers.15.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=586), symbol:model.layers.15.self_attn.k_norm.weight])[symbol:model.layers.15.self_attn.k_norm.weight] - tensor.CPU.register () -> (%4838:tensor<[2048, 2048], Float32, CPU>[@model.layers.15.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=599), symbol:model.layers.15.self_attn.o_proj.weight])[symbol:model.layers.15.self_attn.o_proj.weight] - tensor.CPU.register () -> (%6774:tensor<[2048], Float32, CPU>[@model.layers.15.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=602), symbol:model.layers.15.post_attention_layernorm.weight])[symbol:model.layers.15.post_attention_layernorm.weight] - tensor.CPU.register () -> (%2819:tensor<[6144, 2048], Float32, CPU>[@model.layers.15.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, 
quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=603), symbol:model.layers.15.mlp.gate_proj.weight])[symbol:model.layers.15.mlp.gate_proj.weight] - tensor.CPU.register () -> (%1377:tensor<[6144, 2048], Float32, CPU>[@model.layers.15.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=606), symbol:model.layers.15.mlp.up_proj.weight])[symbol:model.layers.15.mlp.up_proj.weight] - tensor.CPU.register () -> (%526:tensor<[2048, 6144], Float32, CPU>[@model.layers.15.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=608), symbol:model.layers.15.mlp.down_proj.weight])[symbol:model.layers.15.mlp.down_proj.weight] - tensor.CPU.register () -> (%369:tensor<[2048], Float32, CPU>[@model.layers.16.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=611), symbol:model.layers.16.input_layernorm.weight])[symbol:model.layers.16.input_layernorm.weight] - tensor.CPU.register () -> (%2345:tensor<[2048, 2048], Float32, CPU>[@model.layers.16.self_attn.q_proj.weight][symbol:model.layers.16.self_attn.q_proj.weight])[symbol:model.layers.16.self_attn.q_proj.weight] - tensor.CPU.register () -> (%3022:tensor<[1024, 2048], Float32, CPU>[@model.layers.16.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=612), symbol:model.layers.16.self_attn.k_proj.weight])[symbol:model.layers.16.self_attn.k_proj.weight] - tensor.CPU.register () -> (%2931:tensor<[1024, 2048], Float32, 
CPU>[@model.layers.16.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=614), symbol:model.layers.16.self_attn.v_proj.weight])[symbol:model.layers.16.self_attn.v_proj.weight] - tensor.CPU.register () -> (%1150:tensor<[128], Float32, CPU>[@model.layers.16.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=618), symbol:model.layers.16.self_attn.q_norm.weight])[symbol:model.layers.16.self_attn.q_norm.weight] - tensor.CPU.register () -> (%5521:tensor<[128], Float32, CPU>[@model.layers.16.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=620), symbol:model.layers.16.self_attn.k_norm.weight])[symbol:model.layers.16.self_attn.k_norm.weight] - tensor.CPU.register () -> (%672:tensor<[2048, 2048], Float32, CPU>[@model.layers.16.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=633), symbol:model.layers.16.self_attn.o_proj.weight])[symbol:model.layers.16.self_attn.o_proj.weight] - tensor.CPU.register () -> (%6793:tensor<[2048], Float32, CPU>[@model.layers.16.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=636), symbol:model.layers.16.post_attention_layernorm.weight])[symbol:model.layers.16.post_attention_layernorm.weight] - tensor.CPU.register () -> (%993:tensor<[6144, 2048], Float32, CPU>[@model.layers.16.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=637), 
symbol:model.layers.16.mlp.gate_proj.weight])[symbol:model.layers.16.mlp.gate_proj.weight] - tensor.CPU.register () -> (%226:tensor<[6144, 2048], Float32, CPU>[@model.layers.16.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=640), symbol:model.layers.16.mlp.up_proj.weight])[symbol:model.layers.16.mlp.up_proj.weight] - tensor.CPU.register () -> (%7287:tensor<[2048, 6144], Float32, CPU>[@model.layers.16.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=642), symbol:model.layers.16.mlp.down_proj.weight])[symbol:model.layers.16.mlp.down_proj.weight] - tensor.CPU.register () -> (%7811:tensor<[2048], Float32, CPU>[@model.layers.17.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=645), symbol:model.layers.17.input_layernorm.weight])[symbol:model.layers.17.input_layernorm.weight] - tensor.CPU.register () -> (%5758:tensor<[2048, 2048], Float32, CPU>[@model.layers.17.self_attn.q_proj.weight][symbol:model.layers.17.self_attn.q_proj.weight])[symbol:model.layers.17.self_attn.q_proj.weight] - tensor.CPU.register () -> (%2828:tensor<[1024, 2048], Float32, CPU>[@model.layers.17.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=646), symbol:model.layers.17.self_attn.k_proj.weight])[symbol:model.layers.17.self_attn.k_proj.weight] - tensor.CPU.register () -> (%417:tensor<[1024, 2048], Float32, CPU>[@model.layers.17.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: 
Float32), uuid=648), symbol:model.layers.17.self_attn.v_proj.weight])[symbol:model.layers.17.self_attn.v_proj.weight] - tensor.CPU.register () -> (%59:tensor<[128], Float32, CPU>[@model.layers.17.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=652), symbol:model.layers.17.self_attn.q_norm.weight])[symbol:model.layers.17.self_attn.q_norm.weight] - tensor.CPU.register () -> (%7588:tensor<[128], Float32, CPU>[@model.layers.17.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=654), symbol:model.layers.17.self_attn.k_norm.weight])[symbol:model.layers.17.self_attn.k_norm.weight] - tensor.CPU.register () -> (%5285:tensor<[2048, 2048], Float32, CPU>[@model.layers.17.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=667), symbol:model.layers.17.self_attn.o_proj.weight])[symbol:model.layers.17.self_attn.o_proj.weight] - tensor.CPU.register () -> (%3787:tensor<[2048], Float32, CPU>[@model.layers.17.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=670), symbol:model.layers.17.post_attention_layernorm.weight])[symbol:model.layers.17.post_attention_layernorm.weight] - tensor.CPU.register () -> (%4841:tensor<[6144, 2048], Float32, CPU>[@model.layers.17.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=671), symbol:model.layers.17.mlp.gate_proj.weight])[symbol:model.layers.17.mlp.gate_proj.weight] - tensor.CPU.register () -> (%4784:tensor<[6144, 2048], Float32, CPU>[@model.layers.17.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, 
ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=674), symbol:model.layers.17.mlp.up_proj.weight])[symbol:model.layers.17.mlp.up_proj.weight] - tensor.CPU.register () -> (%1908:tensor<[2048, 6144], Float32, CPU>[@model.layers.17.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=676), symbol:model.layers.17.mlp.down_proj.weight])[symbol:model.layers.17.mlp.down_proj.weight] - tensor.CPU.register () -> (%310:tensor<[2048], Float32, CPU>[@model.layers.18.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=679), symbol:model.layers.18.input_layernorm.weight])[symbol:model.layers.18.input_layernorm.weight] - tensor.CPU.register () -> (%7352:tensor<[2048, 2048], Float32, CPU>[@model.layers.18.self_attn.q_proj.weight][symbol:model.layers.18.self_attn.q_proj.weight])[symbol:model.layers.18.self_attn.q_proj.weight] - tensor.CPU.register () -> (%6436:tensor<[1024, 2048], Float32, CPU>[@model.layers.18.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=680), symbol:model.layers.18.self_attn.k_proj.weight])[symbol:model.layers.18.self_attn.k_proj.weight] - tensor.CPU.register () -> (%6164:tensor<[1024, 2048], Float32, CPU>[@model.layers.18.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=682), symbol:model.layers.18.self_attn.v_proj.weight])[symbol:model.layers.18.self_attn.v_proj.weight] - tensor.CPU.register () -> (%2747:tensor<[128], Float32, CPU>[@model.layers.18.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: 
Int16PerTensor), uuid=686), symbol:model.layers.18.self_attn.q_norm.weight])[symbol:model.layers.18.self_attn.q_norm.weight] - tensor.CPU.register () -> (%5281:tensor<[128], Float32, CPU>[@model.layers.18.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=688), symbol:model.layers.18.self_attn.k_norm.weight])[symbol:model.layers.18.self_attn.k_norm.weight] - tensor.CPU.register () -> (%7646:tensor<[2048, 2048], Float32, CPU>[@model.layers.18.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=701), symbol:model.layers.18.self_attn.o_proj.weight])[symbol:model.layers.18.self_attn.o_proj.weight] - tensor.CPU.register () -> (%2540:tensor<[2048], Float32, CPU>[@model.layers.18.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=704), symbol:model.layers.18.post_attention_layernorm.weight])[symbol:model.layers.18.post_attention_layernorm.weight] - tensor.CPU.register () -> (%6101:tensor<[6144, 2048], Float32, CPU>[@model.layers.18.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=705), symbol:model.layers.18.mlp.gate_proj.weight])[symbol:model.layers.18.mlp.gate_proj.weight] - tensor.CPU.register () -> (%2195:tensor<[6144, 2048], Float32, CPU>[@model.layers.18.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=708), symbol:model.layers.18.mlp.up_proj.weight])[symbol:model.layers.18.mlp.up_proj.weight] - tensor.CPU.register () -> (%3651:tensor<[2048, 6144], Float32, 
CPU>[@model.layers.18.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=710), symbol:model.layers.18.mlp.down_proj.weight])[symbol:model.layers.18.mlp.down_proj.weight] - tensor.CPU.register () -> (%3722:tensor<[2048], Float32, CPU>[@model.layers.19.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=713), symbol:model.layers.19.input_layernorm.weight])[symbol:model.layers.19.input_layernorm.weight] - tensor.CPU.register () -> (%1141:tensor<[2048, 2048], Float32, CPU>[@model.layers.19.self_attn.q_proj.weight][symbol:model.layers.19.self_attn.q_proj.weight])[symbol:model.layers.19.self_attn.q_proj.weight] - tensor.CPU.register () -> (%651:tensor<[1024, 2048], Float32, CPU>[@model.layers.19.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=714), symbol:model.layers.19.self_attn.k_proj.weight])[symbol:model.layers.19.self_attn.k_proj.weight] - tensor.CPU.register () -> (%254:tensor<[1024, 2048], Float32, CPU>[@model.layers.19.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=716), symbol:model.layers.19.self_attn.v_proj.weight])[symbol:model.layers.19.self_attn.v_proj.weight] - tensor.CPU.register () -> (%610:tensor<[128], Float32, CPU>[@model.layers.19.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=720), symbol:model.layers.19.self_attn.q_norm.weight])[symbol:model.layers.19.self_attn.q_norm.weight] - tensor.CPU.register () -> (%3691:tensor<[128], Float32, 
CPU>[@model.layers.19.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=722), symbol:model.layers.19.self_attn.k_norm.weight])[symbol:model.layers.19.self_attn.k_norm.weight] - tensor.CPU.register () -> (%7002:tensor<[2048, 2048], Float32, CPU>[@model.layers.19.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=735), symbol:model.layers.19.self_attn.o_proj.weight])[symbol:model.layers.19.self_attn.o_proj.weight] - tensor.CPU.register () -> (%3446:tensor<[2048], Float32, CPU>[@model.layers.19.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=738), symbol:model.layers.19.post_attention_layernorm.weight])[symbol:model.layers.19.post_attention_layernorm.weight] - tensor.CPU.register () -> (%2118:tensor<[6144, 2048], Float32, CPU>[@model.layers.19.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=739), symbol:model.layers.19.mlp.gate_proj.weight])[symbol:model.layers.19.mlp.gate_proj.weight] - tensor.CPU.register () -> (%283:tensor<[6144, 2048], Float32, CPU>[@model.layers.19.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=742), symbol:model.layers.19.mlp.up_proj.weight])[symbol:model.layers.19.mlp.up_proj.weight] - tensor.CPU.register () -> (%1264:tensor<[2048, 6144], Float32, CPU>[@model.layers.19.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: 
Float32), uuid=744), symbol:model.layers.19.mlp.down_proj.weight])[symbol:model.layers.19.mlp.down_proj.weight] - tensor.CPU.register () -> (%5183:tensor<[2048], Float32, CPU>[@model.layers.20.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=747), symbol:model.layers.20.input_layernorm.weight])[symbol:model.layers.20.input_layernorm.weight] - tensor.CPU.register () -> (%6004:tensor<[2048, 2048], Float32, CPU>[@model.layers.20.self_attn.q_proj.weight][symbol:model.layers.20.self_attn.q_proj.weight])[symbol:model.layers.20.self_attn.q_proj.weight] - tensor.CPU.register () -> (%4764:tensor<[1024, 2048], Float32, CPU>[@model.layers.20.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=748), symbol:model.layers.20.self_attn.k_proj.weight])[symbol:model.layers.20.self_attn.k_proj.weight] - tensor.CPU.register () -> (%3516:tensor<[1024, 2048], Float32, CPU>[@model.layers.20.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=750), symbol:model.layers.20.self_attn.v_proj.weight])[symbol:model.layers.20.self_attn.v_proj.weight] - tensor.CPU.register () -> (%2042:tensor<[128], Float32, CPU>[@model.layers.20.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=754), symbol:model.layers.20.self_attn.q_norm.weight])[symbol:model.layers.20.self_attn.q_norm.weight] - tensor.CPU.register () -> (%1646:tensor<[128], Float32, CPU>[@model.layers.20.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=756), symbol:model.layers.20.self_attn.k_norm.weight])[symbol:model.layers.20.self_attn.k_norm.weight] 
- tensor.CPU.register () -> (%3587:tensor<[2048, 2048], Float32, CPU>[@model.layers.20.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=769), symbol:model.layers.20.self_attn.o_proj.weight])[symbol:model.layers.20.self_attn.o_proj.weight] - tensor.CPU.register () -> (%2726:tensor<[2048], Float32, CPU>[@model.layers.20.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=772), symbol:model.layers.20.post_attention_layernorm.weight])[symbol:model.layers.20.post_attention_layernorm.weight] - tensor.CPU.register () -> (%3656:tensor<[6144, 2048], Float32, CPU>[@model.layers.20.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=773), symbol:model.layers.20.mlp.gate_proj.weight])[symbol:model.layers.20.mlp.gate_proj.weight] - tensor.CPU.register () -> (%802:tensor<[6144, 2048], Float32, CPU>[@model.layers.20.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=776), symbol:model.layers.20.mlp.up_proj.weight])[symbol:model.layers.20.mlp.up_proj.weight] - tensor.CPU.register () -> (%62:tensor<[2048, 6144], Float32, CPU>[@model.layers.20.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=778), symbol:model.layers.20.mlp.down_proj.weight])[symbol:model.layers.20.mlp.down_proj.weight] - tensor.CPU.register () -> (%1237:tensor<[2048], Float32, CPU>[@model.layers.21.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=781), symbol:model.layers.21.input_layernorm.weight])[symbol:model.layers.21.input_layernorm.weight] - tensor.CPU.register () -> (%2397:tensor<[2048, 2048], Float32, CPU>[@model.layers.21.self_attn.q_proj.weight][symbol:model.layers.21.self_attn.q_proj.weight])[symbol:model.layers.21.self_attn.q_proj.weight] - tensor.CPU.register () -> (%7562:tensor<[1024, 2048], Float32, CPU>[@model.layers.21.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=782), symbol:model.layers.21.self_attn.k_proj.weight])[symbol:model.layers.21.self_attn.k_proj.weight] - tensor.CPU.register () -> (%4665:tensor<[1024, 2048], Float32, CPU>[@model.layers.21.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=784), symbol:model.layers.21.self_attn.v_proj.weight])[symbol:model.layers.21.self_attn.v_proj.weight] - tensor.CPU.register () -> (%6195:tensor<[128], Float32, CPU>[@model.layers.21.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=788), symbol:model.layers.21.self_attn.q_norm.weight])[symbol:model.layers.21.self_attn.q_norm.weight] - tensor.CPU.register () -> (%701:tensor<[128], Float32, CPU>[@model.layers.21.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=790), symbol:model.layers.21.self_attn.k_norm.weight])[symbol:model.layers.21.self_attn.k_norm.weight] - tensor.CPU.register () -> (%5913:tensor<[2048, 2048], Float32, CPU>[@model.layers.21.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), 
uuid=803), symbol:model.layers.21.self_attn.o_proj.weight])[symbol:model.layers.21.self_attn.o_proj.weight] - tensor.CPU.register () -> (%4765:tensor<[2048], Float32, CPU>[@model.layers.21.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=806), symbol:model.layers.21.post_attention_layernorm.weight])[symbol:model.layers.21.post_attention_layernorm.weight] - tensor.CPU.register () -> (%864:tensor<[6144, 2048], Float32, CPU>[@model.layers.21.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=807), symbol:model.layers.21.mlp.gate_proj.weight])[symbol:model.layers.21.mlp.gate_proj.weight] - tensor.CPU.register () -> (%923:tensor<[6144, 2048], Float32, CPU>[@model.layers.21.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=810), symbol:model.layers.21.mlp.up_proj.weight])[symbol:model.layers.21.mlp.up_proj.weight] - tensor.CPU.register () -> (%6934:tensor<[2048, 6144], Float32, CPU>[@model.layers.21.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=812), symbol:model.layers.21.mlp.down_proj.weight])[symbol:model.layers.21.mlp.down_proj.weight] - tensor.CPU.register () -> (%425:tensor<[2048], Float32, CPU>[@model.layers.22.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=815), symbol:model.layers.22.input_layernorm.weight])[symbol:model.layers.22.input_layernorm.weight] - tensor.CPU.register () -> (%1036:tensor<[2048, 2048], Float32, 
CPU>[@model.layers.22.self_attn.q_proj.weight][symbol:model.layers.22.self_attn.q_proj.weight])[symbol:model.layers.22.self_attn.q_proj.weight] - tensor.CPU.register () -> (%6990:tensor<[1024, 2048], Float32, CPU>[@model.layers.22.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=816), symbol:model.layers.22.self_attn.k_proj.weight])[symbol:model.layers.22.self_attn.k_proj.weight] - tensor.CPU.register () -> (%2703:tensor<[1024, 2048], Float32, CPU>[@model.layers.22.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=818), symbol:model.layers.22.self_attn.v_proj.weight])[symbol:model.layers.22.self_attn.v_proj.weight] - tensor.CPU.register () -> (%1995:tensor<[128], Float32, CPU>[@model.layers.22.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=822), symbol:model.layers.22.self_attn.q_norm.weight])[symbol:model.layers.22.self_attn.q_norm.weight] - tensor.CPU.register () -> (%2702:tensor<[128], Float32, CPU>[@model.layers.22.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=824), symbol:model.layers.22.self_attn.k_norm.weight])[symbol:model.layers.22.self_attn.k_norm.weight] - tensor.CPU.register () -> (%2221:tensor<[2048, 2048], Float32, CPU>[@model.layers.22.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=837), symbol:model.layers.22.self_attn.o_proj.weight])[symbol:model.layers.22.self_attn.o_proj.weight] - tensor.CPU.register () -> (%5286:tensor<[2048], Float32, 
CPU>[@model.layers.22.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=840), symbol:model.layers.22.post_attention_layernorm.weight])[symbol:model.layers.22.post_attention_layernorm.weight] - tensor.CPU.register () -> (%7377:tensor<[6144, 2048], Float32, CPU>[@model.layers.22.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=841), symbol:model.layers.22.mlp.gate_proj.weight])[symbol:model.layers.22.mlp.gate_proj.weight] - tensor.CPU.register () -> (%694:tensor<[6144, 2048], Float32, CPU>[@model.layers.22.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=844), symbol:model.layers.22.mlp.up_proj.weight])[symbol:model.layers.22.mlp.up_proj.weight] - tensor.CPU.register () -> (%1401:tensor<[2048, 6144], Float32, CPU>[@model.layers.22.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=846), symbol:model.layers.22.mlp.down_proj.weight])[symbol:model.layers.22.mlp.down_proj.weight] - tensor.CPU.register () -> (%809:tensor<[2048], Float32, CPU>[@model.layers.23.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=849), symbol:model.layers.23.input_layernorm.weight])[symbol:model.layers.23.input_layernorm.weight] - tensor.CPU.register () -> (%2936:tensor<[2048, 2048], Float32, CPU>[@model.layers.23.self_attn.q_proj.weight][symbol:model.layers.23.self_attn.q_proj.weight])[symbol:model.layers.23.self_attn.q_proj.weight] - tensor.CPU.register () -> (%577:tensor<[1024, 2048], Float32, 
CPU>[@model.layers.23.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=850), symbol:model.layers.23.self_attn.k_proj.weight])[symbol:model.layers.23.self_attn.k_proj.weight] - tensor.CPU.register () -> (%5308:tensor<[1024, 2048], Float32, CPU>[@model.layers.23.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=852), symbol:model.layers.23.self_attn.v_proj.weight])[symbol:model.layers.23.self_attn.v_proj.weight] - tensor.CPU.register () -> (%5454:tensor<[128], Float32, CPU>[@model.layers.23.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=856), symbol:model.layers.23.self_attn.q_norm.weight])[symbol:model.layers.23.self_attn.q_norm.weight] - tensor.CPU.register () -> (%1089:tensor<[128], Float32, CPU>[@model.layers.23.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=858), symbol:model.layers.23.self_attn.k_norm.weight])[symbol:model.layers.23.self_attn.k_norm.weight] - tensor.CPU.register () -> (%4076:tensor<[2048, 2048], Float32, CPU>[@model.layers.23.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=871), symbol:model.layers.23.self_attn.o_proj.weight])[symbol:model.layers.23.self_attn.o_proj.weight] - tensor.CPU.register () -> (%4535:tensor<[2048], Float32, CPU>[@model.layers.23.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=874), 
symbol:model.layers.23.post_attention_layernorm.weight])[symbol:model.layers.23.post_attention_layernorm.weight] - tensor.CPU.register () -> (%7750:tensor<[6144, 2048], Float32, CPU>[@model.layers.23.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=875), symbol:model.layers.23.mlp.gate_proj.weight])[symbol:model.layers.23.mlp.gate_proj.weight] - tensor.CPU.register () -> (%4744:tensor<[6144, 2048], Float32, CPU>[@model.layers.23.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=878), symbol:model.layers.23.mlp.up_proj.weight])[symbol:model.layers.23.mlp.up_proj.weight] - tensor.CPU.register () -> (%2933:tensor<[2048, 6144], Float32, CPU>[@model.layers.23.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=880), symbol:model.layers.23.mlp.down_proj.weight])[symbol:model.layers.23.mlp.down_proj.weight] - tensor.CPU.register () -> (%1154:tensor<[2048], Float32, CPU>[@model.layers.24.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=883), symbol:model.layers.24.input_layernorm.weight])[symbol:model.layers.24.input_layernorm.weight] - tensor.CPU.register () -> (%2384:tensor<[2048, 2048], Float32, CPU>[@model.layers.24.self_attn.q_proj.weight][symbol:model.layers.24.self_attn.q_proj.weight])[symbol:model.layers.24.self_attn.q_proj.weight] - tensor.CPU.register () -> (%2620:tensor<[1024, 2048], Float32, CPU>[@model.layers.24.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: 
UInt4, scale_1_type: Float32), uuid=884), symbol:model.layers.24.self_attn.k_proj.weight])[symbol:model.layers.24.self_attn.k_proj.weight] - tensor.CPU.register () -> (%3265:tensor<[1024, 2048], Float32, CPU>[@model.layers.24.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=886), symbol:model.layers.24.self_attn.v_proj.weight])[symbol:model.layers.24.self_attn.v_proj.weight] - tensor.CPU.register () -> (%2985:tensor<[128], Float32, CPU>[@model.layers.24.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=890), symbol:model.layers.24.self_attn.q_norm.weight])[symbol:model.layers.24.self_attn.q_norm.weight] - tensor.CPU.register () -> (%3894:tensor<[128], Float32, CPU>[@model.layers.24.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=892), symbol:model.layers.24.self_attn.k_norm.weight])[symbol:model.layers.24.self_attn.k_norm.weight] - tensor.CPU.register () -> (%7488:tensor<[2048, 2048], Float32, CPU>[@model.layers.24.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=905), symbol:model.layers.24.self_attn.o_proj.weight])[symbol:model.layers.24.self_attn.o_proj.weight] - tensor.CPU.register () -> (%6713:tensor<[2048], Float32, CPU>[@model.layers.24.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=908), symbol:model.layers.24.post_attention_layernorm.weight])[symbol:model.layers.24.post_attention_layernorm.weight] - tensor.CPU.register () -> (%1336:tensor<[6144, 2048], Float32, CPU>[@model.layers.24.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, 
quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=909), symbol:model.layers.24.mlp.gate_proj.weight])[symbol:model.layers.24.mlp.gate_proj.weight] - tensor.CPU.register () -> (%7035:tensor<[6144, 2048], Float32, CPU>[@model.layers.24.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=912), symbol:model.layers.24.mlp.up_proj.weight])[symbol:model.layers.24.mlp.up_proj.weight] - tensor.CPU.register () -> (%7069:tensor<[2048, 6144], Float32, CPU>[@model.layers.24.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=914), symbol:model.layers.24.mlp.down_proj.weight])[symbol:model.layers.24.mlp.down_proj.weight] - tensor.CPU.register () -> (%6496:tensor<[2048], Float32, CPU>[@model.layers.25.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=917), symbol:model.layers.25.input_layernorm.weight])[symbol:model.layers.25.input_layernorm.weight] - tensor.CPU.register () -> (%1852:tensor<[2048, 2048], Float32, CPU>[@model.layers.25.self_attn.q_proj.weight][symbol:model.layers.25.self_attn.q_proj.weight])[symbol:model.layers.25.self_attn.q_proj.weight] - tensor.CPU.register () -> (%3615:tensor<[1024, 2048], Float32, CPU>[@model.layers.25.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=918), symbol:model.layers.25.self_attn.k_proj.weight])[symbol:model.layers.25.self_attn.k_proj.weight] - tensor.CPU.register () -> (%2014:tensor<[1024, 2048], Float32, 
CPU>[@model.layers.25.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=920), symbol:model.layers.25.self_attn.v_proj.weight])[symbol:model.layers.25.self_attn.v_proj.weight] - tensor.CPU.register () -> (%2021:tensor<[128], Float32, CPU>[@model.layers.25.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=924), symbol:model.layers.25.self_attn.q_norm.weight])[symbol:model.layers.25.self_attn.q_norm.weight] - tensor.CPU.register () -> (%1413:tensor<[128], Float32, CPU>[@model.layers.25.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=926), symbol:model.layers.25.self_attn.k_norm.weight])[symbol:model.layers.25.self_attn.k_norm.weight] - tensor.CPU.register () -> (%7074:tensor<[2048, 2048], Float32, CPU>[@model.layers.25.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=939), symbol:model.layers.25.self_attn.o_proj.weight])[symbol:model.layers.25.self_attn.o_proj.weight] - tensor.CPU.register () -> (%6424:tensor<[2048], Float32, CPU>[@model.layers.25.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=942), symbol:model.layers.25.post_attention_layernorm.weight])[symbol:model.layers.25.post_attention_layernorm.weight] - tensor.CPU.register () -> (%1860:tensor<[6144, 2048], Float32, CPU>[@model.layers.25.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=943), 
symbol:model.layers.25.mlp.gate_proj.weight])[symbol:model.layers.25.mlp.gate_proj.weight] - tensor.CPU.register () -> (%5840:tensor<[6144, 2048], Float32, CPU>[@model.layers.25.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=946), symbol:model.layers.25.mlp.up_proj.weight])[symbol:model.layers.25.mlp.up_proj.weight] - tensor.CPU.register () -> (%6869:tensor<[2048, 6144], Float32, CPU>[@model.layers.25.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=948), symbol:model.layers.25.mlp.down_proj.weight])[symbol:model.layers.25.mlp.down_proj.weight] - tensor.CPU.register () -> (%611:tensor<[2048], Float32, CPU>[@model.layers.26.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=951), symbol:model.layers.26.input_layernorm.weight])[symbol:model.layers.26.input_layernorm.weight] - tensor.CPU.register () -> (%1040:tensor<[2048, 2048], Float32, CPU>[@model.layers.26.self_attn.q_proj.weight][symbol:model.layers.26.self_attn.q_proj.weight])[symbol:model.layers.26.self_attn.q_proj.weight] - tensor.CPU.register () -> (%2312:tensor<[1024, 2048], Float32, CPU>[@model.layers.26.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=952), symbol:model.layers.26.self_attn.k_proj.weight])[symbol:model.layers.26.self_attn.k_proj.weight] - tensor.CPU.register () -> (%174:tensor<[1024, 2048], Float32, CPU>[@model.layers.26.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: 
Float32), uuid=954), symbol:model.layers.26.self_attn.v_proj.weight])[symbol:model.layers.26.self_attn.v_proj.weight] - tensor.CPU.register () -> (%2799:tensor<[128], Float32, CPU>[@model.layers.26.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=958), symbol:model.layers.26.self_attn.q_norm.weight])[symbol:model.layers.26.self_attn.q_norm.weight] - tensor.CPU.register () -> (%6479:tensor<[128], Float32, CPU>[@model.layers.26.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=960), symbol:model.layers.26.self_attn.k_norm.weight])[symbol:model.layers.26.self_attn.k_norm.weight] - tensor.CPU.register () -> (%504:tensor<[2048, 2048], Float32, CPU>[@model.layers.26.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=973), symbol:model.layers.26.self_attn.o_proj.weight])[symbol:model.layers.26.self_attn.o_proj.weight] - tensor.CPU.register () -> (%5096:tensor<[2048], Float32, CPU>[@model.layers.26.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=976), symbol:model.layers.26.post_attention_layernorm.weight])[symbol:model.layers.26.post_attention_layernorm.weight] - tensor.CPU.register () -> (%4867:tensor<[6144, 2048], Float32, CPU>[@model.layers.26.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=977), symbol:model.layers.26.mlp.gate_proj.weight])[symbol:model.layers.26.mlp.gate_proj.weight] - tensor.CPU.register () -> (%2619:tensor<[6144, 2048], Float32, CPU>[@model.layers.26.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, 
ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=980), symbol:model.layers.26.mlp.up_proj.weight])[symbol:model.layers.26.mlp.up_proj.weight] - tensor.CPU.register () -> (%1355:tensor<[2048, 6144], Float32, CPU>[@model.layers.26.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=982), symbol:model.layers.26.mlp.down_proj.weight])[symbol:model.layers.26.mlp.down_proj.weight] - tensor.CPU.register () -> (%6381:tensor<[2048], Float32, CPU>[@model.layers.27.input_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=985), symbol:model.layers.27.input_layernorm.weight])[symbol:model.layers.27.input_layernorm.weight] - tensor.CPU.register () -> (%5946:tensor<[2048, 2048], Float32, CPU>[@model.layers.27.self_attn.q_proj.weight][symbol:model.layers.27.self_attn.q_proj.weight])[symbol:model.layers.27.self_attn.q_proj.weight] - tensor.CPU.register () -> (%1802:tensor<[1024, 2048], Float32, CPU>[@model.layers.27.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=986), symbol:model.layers.27.self_attn.k_proj.weight])[symbol:model.layers.27.self_attn.k_proj.weight] - tensor.CPU.register () -> (%6652:tensor<[1024, 2048], Float32, CPU>[@model.layers.27.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=988), symbol:model.layers.27.self_attn.v_proj.weight])[symbol:model.layers.27.self_attn.v_proj.weight] - tensor.CPU.register () -> (%6206:tensor<[128], Float32, CPU>[@model.layers.27.self_attn.q_norm.weight][quant_recipe:QuantSpec(Raw(type: 
Int16PerTensor), uuid=992), symbol:model.layers.27.self_attn.q_norm.weight])[symbol:model.layers.27.self_attn.q_norm.weight] - tensor.CPU.register () -> (%1743:tensor<[128], Float32, CPU>[@model.layers.27.self_attn.k_norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=994), symbol:model.layers.27.self_attn.k_norm.weight])[symbol:model.layers.27.self_attn.k_norm.weight] - tensor.CPU.register () -> (%5189:tensor<[2048, 2048], Float32, CPU>[@model.layers.27.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1007), symbol:model.layers.27.self_attn.o_proj.weight])[symbol:model.layers.27.self_attn.o_proj.weight] - tensor.CPU.register () -> (%3001:tensor<[2048], Float32, CPU>[@model.layers.27.post_attention_layernorm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1010), symbol:model.layers.27.post_attention_layernorm.weight])[symbol:model.layers.27.post_attention_layernorm.weight] - tensor.CPU.register () -> (%5561:tensor<[6144, 2048], Float32, CPU>[@model.layers.27.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1011), symbol:model.layers.27.mlp.gate_proj.weight])[symbol:model.layers.27.mlp.gate_proj.weight] - tensor.CPU.register () -> (%2731:tensor<[6144, 2048], Float32, CPU>[@model.layers.27.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1014), symbol:model.layers.27.mlp.up_proj.weight])[symbol:model.layers.27.mlp.up_proj.weight] - tensor.CPU.register () -> (%3783:tensor<[2048, 6144], Float32, 
CPU>[@model.layers.27.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1016), symbol:model.layers.27.mlp.down_proj.weight])[symbol:model.layers.27.mlp.down_proj.weight] - tensor.CPU.register () -> (%5765:tensor<[2048], Float32, CPU>[@model.norm.weight][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1019), symbol:model.norm.weight])[symbol:model.norm.weight] - tensor.CPU.register () -> (%6130:tensor<[151936, 2048], Float32, CPU>[@lm_head.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1020), symbol:lm_head.weight])[symbol:lm_head.weight] + tensor.CPU.register () -> (%361:tensor<[151936, 2048], Float32, CPU>[@model.embed_tokens.weight][quant_recipe:QuantSpec(Raw(type: Float32), uuid=61), symbol:model.embed_tokens.weight])[symbol:model.embed_tokens.weight] + tensor.CPU.register () -> (%8204:tensor<[1024, 5, 128], UInt16PerTensor, CPU>[@rope_sin][symbol:rope_sin])[symbol:rope_sin] + tensor.CPU.register () -> (%8205:tensor<[1024, 5, 128], UInt16PerTensor, CPU>[@rope_cos][symbol:rope_cos])[symbol:rope_cos] + tensor.CPU.register () -> (%4256:tensor<[2048], Float32, CPU>[@model.layers.0.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=67), symbol:model.layers.0.input_layernorm.weight])[symbol:model.layers.0.input_layernorm.weight] + tensor.CPU.register () -> (%6100:tensor<[2048, 2048], Float32, CPU>[@model.layers.0.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=68), 
symbol:model.layers.0.self_attn.q_proj.weight])[symbol:model.layers.0.self_attn.q_proj.weight] + tensor.CPU.register () -> (%326:tensor<[1024, 2048], Float32, CPU>[@model.layers.0.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=70), symbol:model.layers.0.self_attn.k_proj.weight])[symbol:model.layers.0.self_attn.k_proj.weight] + tensor.CPU.register () -> (%4416:tensor<[1024, 2048], Float32, CPU>[@model.layers.0.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=72), symbol:model.layers.0.self_attn.v_proj.weight])[symbol:model.layers.0.self_attn.v_proj.weight] + tensor.CPU.register () -> (%7842:tensor<[128], Float32, CPU>[@model.layers.0.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=75), symbol:model.layers.0.self_attn.q_norm.weight])[symbol:model.layers.0.self_attn.q_norm.weight] + tensor.CPU.register () -> (%8182:tensor<[128], Float32, CPU>[@model.layers.0.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=77), symbol:model.layers.0.self_attn.k_norm.weight])[symbol:model.layers.0.self_attn.k_norm.weight] + tensor.CPU.register () -> (%7659:tensor<[2048, 2048], Float32, CPU>[@model.layers.0.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=90), symbol:model.layers.0.self_attn.o_proj.weight])[symbol:model.layers.0.self_attn.o_proj.weight] + tensor.CPU.register () -> (%875:tensor<[2048], Float32, 
CPU>[@model.layers.0.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=93), symbol:model.layers.0.post_attention_layernorm.weight])[symbol:model.layers.0.post_attention_layernorm.weight] + tensor.CPU.register () -> (%6720:tensor<[6144, 2048], Float32, CPU>[@model.layers.0.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=94), symbol:model.layers.0.mlp.up_proj.weight])[symbol:model.layers.0.mlp.up_proj.weight] + tensor.CPU.register () -> (%2083:tensor<[6144, 2048], Float32, CPU>[@model.layers.0.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=96), symbol:model.layers.0.mlp.gate_proj.weight])[symbol:model.layers.0.mlp.gate_proj.weight] + tensor.CPU.register () -> (%1968:tensor<[2048, 6144], Float32, CPU>[@model.layers.0.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=99), symbol:model.layers.0.mlp.down_proj.weight])[symbol:model.layers.0.mlp.down_proj.weight] + tensor.CPU.register () -> (%2912:tensor<[2048], Float32, CPU>[@model.layers.1.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=102), symbol:model.layers.1.input_layernorm.weight])[symbol:model.layers.1.input_layernorm.weight] + tensor.CPU.register () -> (%2564:tensor<[2048, 2048], Float32, CPU>[@model.layers.1.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, 
scale_1_type: Float32), uuid=103), symbol:model.layers.1.self_attn.q_proj.weight])[symbol:model.layers.1.self_attn.q_proj.weight] + tensor.CPU.register () -> (%3192:tensor<[1024, 2048], Float32, CPU>[@model.layers.1.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=105), symbol:model.layers.1.self_attn.k_proj.weight])[symbol:model.layers.1.self_attn.k_proj.weight] + tensor.CPU.register () -> (%3127:tensor<[1024, 2048], Float32, CPU>[@model.layers.1.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=107), symbol:model.layers.1.self_attn.v_proj.weight])[symbol:model.layers.1.self_attn.v_proj.weight] + tensor.CPU.register () -> (%6782:tensor<[128], Float32, CPU>[@model.layers.1.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=110), symbol:model.layers.1.self_attn.q_norm.weight])[symbol:model.layers.1.self_attn.q_norm.weight] + tensor.CPU.register () -> (%5890:tensor<[128], Float32, CPU>[@model.layers.1.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=112), symbol:model.layers.1.self_attn.k_norm.weight])[symbol:model.layers.1.self_attn.k_norm.weight] + tensor.CPU.register () -> (%683:tensor<[2048, 2048], Float32, CPU>[@model.layers.1.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=125), symbol:model.layers.1.self_attn.o_proj.weight])[symbol:model.layers.1.self_attn.o_proj.weight] + tensor.CPU.register () -> 
(%181:tensor<[2048], Float32, CPU>[@model.layers.1.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=128), symbol:model.layers.1.post_attention_layernorm.weight])[symbol:model.layers.1.post_attention_layernorm.weight] + tensor.CPU.register () -> (%2963:tensor<[6144, 2048], Float32, CPU>[@model.layers.1.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=129), symbol:model.layers.1.mlp.up_proj.weight])[symbol:model.layers.1.mlp.up_proj.weight] + tensor.CPU.register () -> (%5173:tensor<[6144, 2048], Float32, CPU>[@model.layers.1.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=131), symbol:model.layers.1.mlp.gate_proj.weight])[symbol:model.layers.1.mlp.gate_proj.weight] + tensor.CPU.register () -> (%5467:tensor<[2048, 6144], Float32, CPU>[@model.layers.1.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=134), symbol:model.layers.1.mlp.down_proj.weight])[symbol:model.layers.1.mlp.down_proj.weight] + tensor.CPU.register () -> (%2379:tensor<[2048], Float32, CPU>[@model.layers.2.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=137), symbol:model.layers.2.input_layernorm.weight])[symbol:model.layers.2.input_layernorm.weight] + tensor.CPU.register () -> (%3865:tensor<[2048, 2048], Float32, CPU>[@model.layers.2.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=138), symbol:model.layers.2.self_attn.q_proj.weight])[symbol:model.layers.2.self_attn.q_proj.weight] + tensor.CPU.register () -> (%1586:tensor<[1024, 2048], Float32, CPU>[@model.layers.2.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=140), symbol:model.layers.2.self_attn.k_proj.weight])[symbol:model.layers.2.self_attn.k_proj.weight] + tensor.CPU.register () -> (%4803:tensor<[1024, 2048], Float32, CPU>[@model.layers.2.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=142), symbol:model.layers.2.self_attn.v_proj.weight])[symbol:model.layers.2.self_attn.v_proj.weight] + tensor.CPU.register () -> (%6973:tensor<[128], Float32, CPU>[@model.layers.2.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=145), symbol:model.layers.2.self_attn.q_norm.weight])[symbol:model.layers.2.self_attn.q_norm.weight] + tensor.CPU.register () -> (%1763:tensor<[128], Float32, CPU>[@model.layers.2.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=147), symbol:model.layers.2.self_attn.k_norm.weight])[symbol:model.layers.2.self_attn.k_norm.weight] + tensor.CPU.register () -> (%6817:tensor<[2048, 2048], Float32, CPU>[@model.layers.2.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=160), 
symbol:model.layers.2.self_attn.o_proj.weight])[symbol:model.layers.2.self_attn.o_proj.weight] + tensor.CPU.register () -> (%984:tensor<[2048], Float32, CPU>[@model.layers.2.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=163), symbol:model.layers.2.post_attention_layernorm.weight])[symbol:model.layers.2.post_attention_layernorm.weight] + tensor.CPU.register () -> (%1952:tensor<[6144, 2048], Float32, CPU>[@model.layers.2.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=164), symbol:model.layers.2.mlp.up_proj.weight])[symbol:model.layers.2.mlp.up_proj.weight] + tensor.CPU.register () -> (%6793:tensor<[6144, 2048], Float32, CPU>[@model.layers.2.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=166), symbol:model.layers.2.mlp.gate_proj.weight])[symbol:model.layers.2.mlp.gate_proj.weight] + tensor.CPU.register () -> (%7125:tensor<[2048, 6144], Float32, CPU>[@model.layers.2.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=169), symbol:model.layers.2.mlp.down_proj.weight])[symbol:model.layers.2.mlp.down_proj.weight] + tensor.CPU.register () -> (%1636:tensor<[2048], Float32, CPU>[@model.layers.3.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=172), symbol:model.layers.3.input_layernorm.weight])[symbol:model.layers.3.input_layernorm.weight] + tensor.CPU.register () -> (%5214:tensor<[2048, 2048], Float32, 
CPU>[@model.layers.3.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=173), symbol:model.layers.3.self_attn.q_proj.weight])[symbol:model.layers.3.self_attn.q_proj.weight] + tensor.CPU.register () -> (%6900:tensor<[1024, 2048], Float32, CPU>[@model.layers.3.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=175), symbol:model.layers.3.self_attn.k_proj.weight])[symbol:model.layers.3.self_attn.k_proj.weight] + tensor.CPU.register () -> (%2141:tensor<[1024, 2048], Float32, CPU>[@model.layers.3.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=177), symbol:model.layers.3.self_attn.v_proj.weight])[symbol:model.layers.3.self_attn.v_proj.weight] + tensor.CPU.register () -> (%3669:tensor<[128], Float32, CPU>[@model.layers.3.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=180), symbol:model.layers.3.self_attn.q_norm.weight])[symbol:model.layers.3.self_attn.q_norm.weight] + tensor.CPU.register () -> (%4334:tensor<[128], Float32, CPU>[@model.layers.3.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=182), symbol:model.layers.3.self_attn.k_norm.weight])[symbol:model.layers.3.self_attn.k_norm.weight] + tensor.CPU.register () -> (%8150:tensor<[2048, 2048], Float32, CPU>[@model.layers.3.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: 
UInt4, scale_1_type: Float32), uuid=195), symbol:model.layers.3.self_attn.o_proj.weight])[symbol:model.layers.3.self_attn.o_proj.weight] + tensor.CPU.register () -> (%4105:tensor<[2048], Float32, CPU>[@model.layers.3.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=198), symbol:model.layers.3.post_attention_layernorm.weight])[symbol:model.layers.3.post_attention_layernorm.weight] + tensor.CPU.register () -> (%6926:tensor<[6144, 2048], Float32, CPU>[@model.layers.3.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=199), symbol:model.layers.3.mlp.up_proj.weight])[symbol:model.layers.3.mlp.up_proj.weight] + tensor.CPU.register () -> (%6632:tensor<[6144, 2048], Float32, CPU>[@model.layers.3.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=201), symbol:model.layers.3.mlp.gate_proj.weight])[symbol:model.layers.3.mlp.gate_proj.weight] + tensor.CPU.register () -> (%1818:tensor<[2048, 6144], Float32, CPU>[@model.layers.3.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=204), symbol:model.layers.3.mlp.down_proj.weight])[symbol:model.layers.3.mlp.down_proj.weight] + tensor.CPU.register () -> (%269:tensor<[2048], Float32, CPU>[@model.layers.4.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=207), symbol:model.layers.4.input_layernorm.weight])[symbol:model.layers.4.input_layernorm.weight] + tensor.CPU.register () -> (%973:tensor<[2048, 
2048], Float32, CPU>[@model.layers.4.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=208), symbol:model.layers.4.self_attn.q_proj.weight])[symbol:model.layers.4.self_attn.q_proj.weight] + tensor.CPU.register () -> (%6187:tensor<[1024, 2048], Float32, CPU>[@model.layers.4.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=210), symbol:model.layers.4.self_attn.k_proj.weight])[symbol:model.layers.4.self_attn.k_proj.weight] + tensor.CPU.register () -> (%6381:tensor<[1024, 2048], Float32, CPU>[@model.layers.4.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=212), symbol:model.layers.4.self_attn.v_proj.weight])[symbol:model.layers.4.self_attn.v_proj.weight] + tensor.CPU.register () -> (%466:tensor<[128], Float32, CPU>[@model.layers.4.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=215), symbol:model.layers.4.self_attn.q_norm.weight])[symbol:model.layers.4.self_attn.q_norm.weight] + tensor.CPU.register () -> (%6834:tensor<[128], Float32, CPU>[@model.layers.4.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=217), symbol:model.layers.4.self_attn.k_norm.weight])[symbol:model.layers.4.self_attn.k_norm.weight] + tensor.CPU.register () -> (%7756:tensor<[2048, 2048], Float32, CPU>[@model.layers.4.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, 
quant_to_type: UInt4, scale_1_type: Float32), uuid=230), symbol:model.layers.4.self_attn.o_proj.weight])[symbol:model.layers.4.self_attn.o_proj.weight] + tensor.CPU.register () -> (%4372:tensor<[2048], Float32, CPU>[@model.layers.4.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=233), symbol:model.layers.4.post_attention_layernorm.weight])[symbol:model.layers.4.post_attention_layernorm.weight] + tensor.CPU.register () -> (%6103:tensor<[6144, 2048], Float32, CPU>[@model.layers.4.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=234), symbol:model.layers.4.mlp.up_proj.weight])[symbol:model.layers.4.mlp.up_proj.weight] + tensor.CPU.register () -> (%2402:tensor<[6144, 2048], Float32, CPU>[@model.layers.4.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=236), symbol:model.layers.4.mlp.gate_proj.weight])[symbol:model.layers.4.mlp.gate_proj.weight] + tensor.CPU.register () -> (%355:tensor<[2048, 6144], Float32, CPU>[@model.layers.4.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=239), symbol:model.layers.4.mlp.down_proj.weight])[symbol:model.layers.4.mlp.down_proj.weight] + tensor.CPU.register () -> (%7342:tensor<[2048], Float32, CPU>[@model.layers.5.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=242), symbol:model.layers.5.input_layernorm.weight])[symbol:model.layers.5.input_layernorm.weight] + tensor.CPU.register () -> 
(%756:tensor<[2048, 2048], Float32, CPU>[@model.layers.5.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=243), symbol:model.layers.5.self_attn.q_proj.weight])[symbol:model.layers.5.self_attn.q_proj.weight] + tensor.CPU.register () -> (%7540:tensor<[1024, 2048], Float32, CPU>[@model.layers.5.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=245), symbol:model.layers.5.self_attn.k_proj.weight])[symbol:model.layers.5.self_attn.k_proj.weight] + tensor.CPU.register () -> (%1477:tensor<[1024, 2048], Float32, CPU>[@model.layers.5.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=247), symbol:model.layers.5.self_attn.v_proj.weight])[symbol:model.layers.5.self_attn.v_proj.weight] + tensor.CPU.register () -> (%3429:tensor<[128], Float32, CPU>[@model.layers.5.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=250), symbol:model.layers.5.self_attn.q_norm.weight])[symbol:model.layers.5.self_attn.q_norm.weight] + tensor.CPU.register () -> (%2834:tensor<[128], Float32, CPU>[@model.layers.5.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=252), symbol:model.layers.5.self_attn.k_norm.weight])[symbol:model.layers.5.self_attn.k_norm.weight] + tensor.CPU.register () -> (%8077:tensor<[2048, 2048], Float32, CPU>[@model.layers.5.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=265), symbol:model.layers.5.self_attn.o_proj.weight])[symbol:model.layers.5.self_attn.o_proj.weight] + tensor.CPU.register () -> (%5901:tensor<[2048], Float32, CPU>[@model.layers.5.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=268), symbol:model.layers.5.post_attention_layernorm.weight])[symbol:model.layers.5.post_attention_layernorm.weight] + tensor.CPU.register () -> (%769:tensor<[6144, 2048], Float32, CPU>[@model.layers.5.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=269), symbol:model.layers.5.mlp.up_proj.weight])[symbol:model.layers.5.mlp.up_proj.weight] + tensor.CPU.register () -> (%1874:tensor<[6144, 2048], Float32, CPU>[@model.layers.5.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=271), symbol:model.layers.5.mlp.gate_proj.weight])[symbol:model.layers.5.mlp.gate_proj.weight] + tensor.CPU.register () -> (%4892:tensor<[2048, 6144], Float32, CPU>[@model.layers.5.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=274), symbol:model.layers.5.mlp.down_proj.weight])[symbol:model.layers.5.mlp.down_proj.weight] + tensor.CPU.register () -> (%3540:tensor<[2048], Float32, CPU>[@model.layers.6.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=277), symbol:model.layers.6.input_layernorm.weight])[symbol:model.layers.6.input_layernorm.weight] + 
tensor.CPU.register () -> (%4173:tensor<[2048, 2048], Float32, CPU>[@model.layers.6.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=278), symbol:model.layers.6.self_attn.q_proj.weight])[symbol:model.layers.6.self_attn.q_proj.weight] + tensor.CPU.register () -> (%877:tensor<[1024, 2048], Float32, CPU>[@model.layers.6.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=280), symbol:model.layers.6.self_attn.k_proj.weight])[symbol:model.layers.6.self_attn.k_proj.weight] + tensor.CPU.register () -> (%1344:tensor<[1024, 2048], Float32, CPU>[@model.layers.6.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=282), symbol:model.layers.6.self_attn.v_proj.weight])[symbol:model.layers.6.self_attn.v_proj.weight] + tensor.CPU.register () -> (%7487:tensor<[128], Float32, CPU>[@model.layers.6.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=285), symbol:model.layers.6.self_attn.q_norm.weight])[symbol:model.layers.6.self_attn.q_norm.weight] + tensor.CPU.register () -> (%5126:tensor<[128], Float32, CPU>[@model.layers.6.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=287), symbol:model.layers.6.self_attn.k_norm.weight])[symbol:model.layers.6.self_attn.k_norm.weight] + tensor.CPU.register () -> (%3940:tensor<[2048, 2048], Float32, CPU>[@model.layers.6.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, 
block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=300), symbol:model.layers.6.self_attn.o_proj.weight])[symbol:model.layers.6.self_attn.o_proj.weight] + tensor.CPU.register () -> (%5378:tensor<[2048], Float32, CPU>[@model.layers.6.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=303), symbol:model.layers.6.post_attention_layernorm.weight])[symbol:model.layers.6.post_attention_layernorm.weight] + tensor.CPU.register () -> (%4973:tensor<[6144, 2048], Float32, CPU>[@model.layers.6.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=304), symbol:model.layers.6.mlp.up_proj.weight])[symbol:model.layers.6.mlp.up_proj.weight] + tensor.CPU.register () -> (%7150:tensor<[6144, 2048], Float32, CPU>[@model.layers.6.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=306), symbol:model.layers.6.mlp.gate_proj.weight])[symbol:model.layers.6.mlp.gate_proj.weight] + tensor.CPU.register () -> (%5276:tensor<[2048, 6144], Float32, CPU>[@model.layers.6.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=309), symbol:model.layers.6.mlp.down_proj.weight])[symbol:model.layers.6.mlp.down_proj.weight] + tensor.CPU.register () -> (%1865:tensor<[2048], Float32, CPU>[@model.layers.7.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=312), 
symbol:model.layers.7.input_layernorm.weight])[symbol:model.layers.7.input_layernorm.weight] + tensor.CPU.register () -> (%7715:tensor<[2048, 2048], Float32, CPU>[@model.layers.7.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=313), symbol:model.layers.7.self_attn.q_proj.weight])[symbol:model.layers.7.self_attn.q_proj.weight] + tensor.CPU.register () -> (%1658:tensor<[1024, 2048], Float32, CPU>[@model.layers.7.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=315), symbol:model.layers.7.self_attn.k_proj.weight])[symbol:model.layers.7.self_attn.k_proj.weight] + tensor.CPU.register () -> (%5896:tensor<[1024, 2048], Float32, CPU>[@model.layers.7.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=317), symbol:model.layers.7.self_attn.v_proj.weight])[symbol:model.layers.7.self_attn.v_proj.weight] + tensor.CPU.register () -> (%7733:tensor<[128], Float32, CPU>[@model.layers.7.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=320), symbol:model.layers.7.self_attn.q_norm.weight])[symbol:model.layers.7.self_attn.q_norm.weight] + tensor.CPU.register () -> (%1643:tensor<[128], Float32, CPU>[@model.layers.7.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=322), symbol:model.layers.7.self_attn.k_norm.weight])[symbol:model.layers.7.self_attn.k_norm.weight] + tensor.CPU.register () -> (%2968:tensor<[2048, 2048], Float32, 
CPU>[@model.layers.7.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=335), symbol:model.layers.7.self_attn.o_proj.weight])[symbol:model.layers.7.self_attn.o_proj.weight] + tensor.CPU.register () -> (%2978:tensor<[2048], Float32, CPU>[@model.layers.7.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=338), symbol:model.layers.7.post_attention_layernorm.weight])[symbol:model.layers.7.post_attention_layernorm.weight] + tensor.CPU.register () -> (%2994:tensor<[6144, 2048], Float32, CPU>[@model.layers.7.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=339), symbol:model.layers.7.mlp.up_proj.weight])[symbol:model.layers.7.mlp.up_proj.weight] + tensor.CPU.register () -> (%6231:tensor<[6144, 2048], Float32, CPU>[@model.layers.7.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=341), symbol:model.layers.7.mlp.gate_proj.weight])[symbol:model.layers.7.mlp.gate_proj.weight] + tensor.CPU.register () -> (%7639:tensor<[2048, 6144], Float32, CPU>[@model.layers.7.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=344), symbol:model.layers.7.mlp.down_proj.weight])[symbol:model.layers.7.mlp.down_proj.weight] + tensor.CPU.register () -> (%2157:tensor<[2048], Float32, CPU>[@model.layers.8.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=347), symbol:model.layers.8.input_layernorm.weight])[symbol:model.layers.8.input_layernorm.weight] + tensor.CPU.register () -> (%7895:tensor<[2048, 2048], Float32, CPU>[@model.layers.8.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=348), symbol:model.layers.8.self_attn.q_proj.weight])[symbol:model.layers.8.self_attn.q_proj.weight] + tensor.CPU.register () -> (%2622:tensor<[1024, 2048], Float32, CPU>[@model.layers.8.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=350), symbol:model.layers.8.self_attn.k_proj.weight])[symbol:model.layers.8.self_attn.k_proj.weight] + tensor.CPU.register () -> (%5444:tensor<[1024, 2048], Float32, CPU>[@model.layers.8.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=352), symbol:model.layers.8.self_attn.v_proj.weight])[symbol:model.layers.8.self_attn.v_proj.weight] + tensor.CPU.register () -> (%1167:tensor<[128], Float32, CPU>[@model.layers.8.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=355), symbol:model.layers.8.self_attn.q_norm.weight])[symbol:model.layers.8.self_attn.q_norm.weight] + tensor.CPU.register () -> (%7773:tensor<[128], Float32, CPU>[@model.layers.8.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=357), symbol:model.layers.8.self_attn.k_norm.weight])[symbol:model.layers.8.self_attn.k_norm.weight] + tensor.CPU.register () -> 
(%2063:tensor<[2048, 2048], Float32, CPU>[@model.layers.8.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=370), symbol:model.layers.8.self_attn.o_proj.weight])[symbol:model.layers.8.self_attn.o_proj.weight] + tensor.CPU.register () -> (%4799:tensor<[2048], Float32, CPU>[@model.layers.8.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=373), symbol:model.layers.8.post_attention_layernorm.weight])[symbol:model.layers.8.post_attention_layernorm.weight] + tensor.CPU.register () -> (%5512:tensor<[6144, 2048], Float32, CPU>[@model.layers.8.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=374), symbol:model.layers.8.mlp.up_proj.weight])[symbol:model.layers.8.mlp.up_proj.weight] + tensor.CPU.register () -> (%4801:tensor<[6144, 2048], Float32, CPU>[@model.layers.8.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=376), symbol:model.layers.8.mlp.gate_proj.weight])[symbol:model.layers.8.mlp.gate_proj.weight] + tensor.CPU.register () -> (%5712:tensor<[2048, 6144], Float32, CPU>[@model.layers.8.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=379), symbol:model.layers.8.mlp.down_proj.weight])[symbol:model.layers.8.mlp.down_proj.weight] + tensor.CPU.register () -> (%3935:tensor<[2048], Float32, CPU>[@model.layers.9.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=382), symbol:model.layers.9.input_layernorm.weight])[symbol:model.layers.9.input_layernorm.weight] + tensor.CPU.register () -> (%1754:tensor<[2048, 2048], Float32, CPU>[@model.layers.9.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=383), symbol:model.layers.9.self_attn.q_proj.weight])[symbol:model.layers.9.self_attn.q_proj.weight] + tensor.CPU.register () -> (%7274:tensor<[1024, 2048], Float32, CPU>[@model.layers.9.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=385), symbol:model.layers.9.self_attn.k_proj.weight])[symbol:model.layers.9.self_attn.k_proj.weight] + tensor.CPU.register () -> (%4983:tensor<[1024, 2048], Float32, CPU>[@model.layers.9.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=387), symbol:model.layers.9.self_attn.v_proj.weight])[symbol:model.layers.9.self_attn.v_proj.weight] + tensor.CPU.register () -> (%1127:tensor<[128], Float32, CPU>[@model.layers.9.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=390), symbol:model.layers.9.self_attn.q_norm.weight])[symbol:model.layers.9.self_attn.q_norm.weight] + tensor.CPU.register () -> (%964:tensor<[128], Float32, CPU>[@model.layers.9.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=392), symbol:model.layers.9.self_attn.k_norm.weight])[symbol:model.layers.9.self_attn.k_norm.weight] + 
tensor.CPU.register () -> (%4355:tensor<[2048, 2048], Float32, CPU>[@model.layers.9.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=405), symbol:model.layers.9.self_attn.o_proj.weight])[symbol:model.layers.9.self_attn.o_proj.weight] + tensor.CPU.register () -> (%4793:tensor<[2048], Float32, CPU>[@model.layers.9.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=408), symbol:model.layers.9.post_attention_layernorm.weight])[symbol:model.layers.9.post_attention_layernorm.weight] + tensor.CPU.register () -> (%7662:tensor<[6144, 2048], Float32, CPU>[@model.layers.9.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=409), symbol:model.layers.9.mlp.up_proj.weight])[symbol:model.layers.9.mlp.up_proj.weight] + tensor.CPU.register () -> (%6098:tensor<[6144, 2048], Float32, CPU>[@model.layers.9.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=411), symbol:model.layers.9.mlp.gate_proj.weight])[symbol:model.layers.9.mlp.gate_proj.weight] + tensor.CPU.register () -> (%333:tensor<[2048, 6144], Float32, CPU>[@model.layers.9.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=414), symbol:model.layers.9.mlp.down_proj.weight])[symbol:model.layers.9.mlp.down_proj.weight] + tensor.CPU.register () -> (%3044:tensor<[2048], Float32, CPU>[@model.layers.10.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=417), symbol:model.layers.10.input_layernorm.weight])[symbol:model.layers.10.input_layernorm.weight] + tensor.CPU.register () -> (%208:tensor<[2048, 2048], Float32, CPU>[@model.layers.10.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=418), symbol:model.layers.10.self_attn.q_proj.weight])[symbol:model.layers.10.self_attn.q_proj.weight] + tensor.CPU.register () -> (%5527:tensor<[1024, 2048], Float32, CPU>[@model.layers.10.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=420), symbol:model.layers.10.self_attn.k_proj.weight])[symbol:model.layers.10.self_attn.k_proj.weight] + tensor.CPU.register () -> (%2767:tensor<[1024, 2048], Float32, CPU>[@model.layers.10.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=422), symbol:model.layers.10.self_attn.v_proj.weight])[symbol:model.layers.10.self_attn.v_proj.weight] + tensor.CPU.register () -> (%6433:tensor<[128], Float32, CPU>[@model.layers.10.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=425), symbol:model.layers.10.self_attn.q_norm.weight])[symbol:model.layers.10.self_attn.q_norm.weight] + tensor.CPU.register () -> (%1215:tensor<[128], Float32, CPU>[@model.layers.10.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=427), 
symbol:model.layers.10.self_attn.k_norm.weight])[symbol:model.layers.10.self_attn.k_norm.weight] + tensor.CPU.register () -> (%2136:tensor<[2048, 2048], Float32, CPU>[@model.layers.10.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=440), symbol:model.layers.10.self_attn.o_proj.weight])[symbol:model.layers.10.self_attn.o_proj.weight] + tensor.CPU.register () -> (%1173:tensor<[2048], Float32, CPU>[@model.layers.10.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=443), symbol:model.layers.10.post_attention_layernorm.weight])[symbol:model.layers.10.post_attention_layernorm.weight] + tensor.CPU.register () -> (%4087:tensor<[6144, 2048], Float32, CPU>[@model.layers.10.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=444), symbol:model.layers.10.mlp.up_proj.weight])[symbol:model.layers.10.mlp.up_proj.weight] + tensor.CPU.register () -> (%6334:tensor<[6144, 2048], Float32, CPU>[@model.layers.10.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=446), symbol:model.layers.10.mlp.gate_proj.weight])[symbol:model.layers.10.mlp.gate_proj.weight] + tensor.CPU.register () -> (%2160:tensor<[2048, 6144], Float32, CPU>[@model.layers.10.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=449), symbol:model.layers.10.mlp.down_proj.weight])[symbol:model.layers.10.mlp.down_proj.weight] + tensor.CPU.register () -> 
(%6029:tensor<[2048], Float32, CPU>[@model.layers.11.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=452), symbol:model.layers.11.input_layernorm.weight])[symbol:model.layers.11.input_layernorm.weight] + tensor.CPU.register () -> (%87:tensor<[2048, 2048], Float32, CPU>[@model.layers.11.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=453), symbol:model.layers.11.self_attn.q_proj.weight])[symbol:model.layers.11.self_attn.q_proj.weight] + tensor.CPU.register () -> (%6705:tensor<[1024, 2048], Float32, CPU>[@model.layers.11.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=455), symbol:model.layers.11.self_attn.k_proj.weight])[symbol:model.layers.11.self_attn.k_proj.weight] + tensor.CPU.register () -> (%532:tensor<[1024, 2048], Float32, CPU>[@model.layers.11.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=457), symbol:model.layers.11.self_attn.v_proj.weight])[symbol:model.layers.11.self_attn.v_proj.weight] + tensor.CPU.register () -> (%2075:tensor<[128], Float32, CPU>[@model.layers.11.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=460), symbol:model.layers.11.self_attn.q_norm.weight])[symbol:model.layers.11.self_attn.q_norm.weight] + tensor.CPU.register () -> (%5298:tensor<[128], Float32, CPU>[@model.layers.11.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=462), symbol:model.layers.11.self_attn.k_norm.weight])[symbol:model.layers.11.self_attn.k_norm.weight] + tensor.CPU.register () -> (%6489:tensor<[2048, 2048], Float32, CPU>[@model.layers.11.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=475), symbol:model.layers.11.self_attn.o_proj.weight])[symbol:model.layers.11.self_attn.o_proj.weight] + tensor.CPU.register () -> (%407:tensor<[2048], Float32, CPU>[@model.layers.11.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=478), symbol:model.layers.11.post_attention_layernorm.weight])[symbol:model.layers.11.post_attention_layernorm.weight] + tensor.CPU.register () -> (%6171:tensor<[6144, 2048], Float32, CPU>[@model.layers.11.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=479), symbol:model.layers.11.mlp.up_proj.weight])[symbol:model.layers.11.mlp.up_proj.weight] + tensor.CPU.register () -> (%8146:tensor<[6144, 2048], Float32, CPU>[@model.layers.11.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=481), symbol:model.layers.11.mlp.gate_proj.weight])[symbol:model.layers.11.mlp.gate_proj.weight] + tensor.CPU.register () -> (%575:tensor<[2048, 6144], Float32, CPU>[@model.layers.11.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=484), 
symbol:model.layers.11.mlp.down_proj.weight])[symbol:model.layers.11.mlp.down_proj.weight] + tensor.CPU.register () -> (%861:tensor<[2048], Float32, CPU>[@model.layers.12.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=487), symbol:model.layers.12.input_layernorm.weight])[symbol:model.layers.12.input_layernorm.weight] + tensor.CPU.register () -> (%1138:tensor<[2048, 2048], Float32, CPU>[@model.layers.12.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=488), symbol:model.layers.12.self_attn.q_proj.weight])[symbol:model.layers.12.self_attn.q_proj.weight] + tensor.CPU.register () -> (%8178:tensor<[1024, 2048], Float32, CPU>[@model.layers.12.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=490), symbol:model.layers.12.self_attn.k_proj.weight])[symbol:model.layers.12.self_attn.k_proj.weight] + tensor.CPU.register () -> (%5503:tensor<[1024, 2048], Float32, CPU>[@model.layers.12.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=492), symbol:model.layers.12.self_attn.v_proj.weight])[symbol:model.layers.12.self_attn.v_proj.weight] + tensor.CPU.register () -> (%5531:tensor<[128], Float32, CPU>[@model.layers.12.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=495), symbol:model.layers.12.self_attn.q_norm.weight])[symbol:model.layers.12.self_attn.q_norm.weight] + tensor.CPU.register () -> (%7120:tensor<[128], Float32, 
CPU>[@model.layers.12.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=497), symbol:model.layers.12.self_attn.k_norm.weight])[symbol:model.layers.12.self_attn.k_norm.weight] + tensor.CPU.register () -> (%3812:tensor<[2048, 2048], Float32, CPU>[@model.layers.12.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=510), symbol:model.layers.12.self_attn.o_proj.weight])[symbol:model.layers.12.self_attn.o_proj.weight] + tensor.CPU.register () -> (%5701:tensor<[2048], Float32, CPU>[@model.layers.12.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=513), symbol:model.layers.12.post_attention_layernorm.weight])[symbol:model.layers.12.post_attention_layernorm.weight] + tensor.CPU.register () -> (%1006:tensor<[6144, 2048], Float32, CPU>[@model.layers.12.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=514), symbol:model.layers.12.mlp.up_proj.weight])[symbol:model.layers.12.mlp.up_proj.weight] + tensor.CPU.register () -> (%4400:tensor<[6144, 2048], Float32, CPU>[@model.layers.12.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=516), symbol:model.layers.12.mlp.gate_proj.weight])[symbol:model.layers.12.mlp.gate_proj.weight] + tensor.CPU.register () -> (%6759:tensor<[2048, 6144], Float32, CPU>[@model.layers.12.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=519), symbol:model.layers.12.mlp.down_proj.weight])[symbol:model.layers.12.mlp.down_proj.weight] + tensor.CPU.register () -> (%4069:tensor<[2048], Float32, CPU>[@model.layers.13.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=522), symbol:model.layers.13.input_layernorm.weight])[symbol:model.layers.13.input_layernorm.weight] + tensor.CPU.register () -> (%6517:tensor<[2048, 2048], Float32, CPU>[@model.layers.13.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=523), symbol:model.layers.13.self_attn.q_proj.weight])[symbol:model.layers.13.self_attn.q_proj.weight] + tensor.CPU.register () -> (%7247:tensor<[1024, 2048], Float32, CPU>[@model.layers.13.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=525), symbol:model.layers.13.self_attn.k_proj.weight])[symbol:model.layers.13.self_attn.k_proj.weight] + tensor.CPU.register () -> (%4830:tensor<[1024, 2048], Float32, CPU>[@model.layers.13.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=527), symbol:model.layers.13.self_attn.v_proj.weight])[symbol:model.layers.13.self_attn.v_proj.weight] + tensor.CPU.register () -> (%7510:tensor<[128], Float32, CPU>[@model.layers.13.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=530), 
symbol:model.layers.13.self_attn.q_norm.weight])[symbol:model.layers.13.self_attn.q_norm.weight] + tensor.CPU.register () -> (%1546:tensor<[128], Float32, CPU>[@model.layers.13.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=532), symbol:model.layers.13.self_attn.k_norm.weight])[symbol:model.layers.13.self_attn.k_norm.weight] + tensor.CPU.register () -> (%4956:tensor<[2048, 2048], Float32, CPU>[@model.layers.13.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=545), symbol:model.layers.13.self_attn.o_proj.weight])[symbol:model.layers.13.self_attn.o_proj.weight] + tensor.CPU.register () -> (%1863:tensor<[2048], Float32, CPU>[@model.layers.13.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=548), symbol:model.layers.13.post_attention_layernorm.weight])[symbol:model.layers.13.post_attention_layernorm.weight] + tensor.CPU.register () -> (%4198:tensor<[6144, 2048], Float32, CPU>[@model.layers.13.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=549), symbol:model.layers.13.mlp.up_proj.weight])[symbol:model.layers.13.mlp.up_proj.weight] + tensor.CPU.register () -> (%3651:tensor<[6144, 2048], Float32, CPU>[@model.layers.13.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=551), symbol:model.layers.13.mlp.gate_proj.weight])[symbol:model.layers.13.mlp.gate_proj.weight] + tensor.CPU.register () -> (%5457:tensor<[2048, 6144], 
Float32, CPU>[@model.layers.13.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=554), symbol:model.layers.13.mlp.down_proj.weight])[symbol:model.layers.13.mlp.down_proj.weight] + tensor.CPU.register () -> (%4807:tensor<[2048], Float32, CPU>[@model.layers.14.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=557), symbol:model.layers.14.input_layernorm.weight])[symbol:model.layers.14.input_layernorm.weight] + tensor.CPU.register () -> (%2924:tensor<[2048, 2048], Float32, CPU>[@model.layers.14.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=558), symbol:model.layers.14.self_attn.q_proj.weight])[symbol:model.layers.14.self_attn.q_proj.weight] + tensor.CPU.register () -> (%6136:tensor<[1024, 2048], Float32, CPU>[@model.layers.14.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=560), symbol:model.layers.14.self_attn.k_proj.weight])[symbol:model.layers.14.self_attn.k_proj.weight] + tensor.CPU.register () -> (%5240:tensor<[1024, 2048], Float32, CPU>[@model.layers.14.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=562), symbol:model.layers.14.self_attn.v_proj.weight])[symbol:model.layers.14.self_attn.v_proj.weight] + tensor.CPU.register () -> (%3852:tensor<[128], Float32, CPU>[@model.layers.14.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=565), symbol:model.layers.14.self_attn.q_norm.weight])[symbol:model.layers.14.self_attn.q_norm.weight] + tensor.CPU.register () -> (%5634:tensor<[128], Float32, CPU>[@model.layers.14.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=567), symbol:model.layers.14.self_attn.k_norm.weight])[symbol:model.layers.14.self_attn.k_norm.weight] + tensor.CPU.register () -> (%331:tensor<[2048, 2048], Float32, CPU>[@model.layers.14.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=580), symbol:model.layers.14.self_attn.o_proj.weight])[symbol:model.layers.14.self_attn.o_proj.weight] + tensor.CPU.register () -> (%7059:tensor<[2048], Float32, CPU>[@model.layers.14.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=583), symbol:model.layers.14.post_attention_layernorm.weight])[symbol:model.layers.14.post_attention_layernorm.weight] + tensor.CPU.register () -> (%631:tensor<[6144, 2048], Float32, CPU>[@model.layers.14.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=584), symbol:model.layers.14.mlp.up_proj.weight])[symbol:model.layers.14.mlp.up_proj.weight] + tensor.CPU.register () -> (%2479:tensor<[6144, 2048], Float32, CPU>[@model.layers.14.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=586), symbol:model.layers.14.mlp.gate_proj.weight])[symbol:model.layers.14.mlp.gate_proj.weight] + 
tensor.CPU.register () -> (%4629:tensor<[2048, 6144], Float32, CPU>[@model.layers.14.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=589), symbol:model.layers.14.mlp.down_proj.weight])[symbol:model.layers.14.mlp.down_proj.weight] + tensor.CPU.register () -> (%1464:tensor<[2048], Float32, CPU>[@model.layers.15.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=592), symbol:model.layers.15.input_layernorm.weight])[symbol:model.layers.15.input_layernorm.weight] + tensor.CPU.register () -> (%4989:tensor<[2048, 2048], Float32, CPU>[@model.layers.15.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=593), symbol:model.layers.15.self_attn.q_proj.weight])[symbol:model.layers.15.self_attn.q_proj.weight] + tensor.CPU.register () -> (%2031:tensor<[1024, 2048], Float32, CPU>[@model.layers.15.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=595), symbol:model.layers.15.self_attn.k_proj.weight])[symbol:model.layers.15.self_attn.k_proj.weight] + tensor.CPU.register () -> (%1922:tensor<[1024, 2048], Float32, CPU>[@model.layers.15.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=597), symbol:model.layers.15.self_attn.v_proj.weight])[symbol:model.layers.15.self_attn.v_proj.weight] + tensor.CPU.register () -> (%6176:tensor<[128], Float32, 
CPU>[@model.layers.15.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=600), symbol:model.layers.15.self_attn.q_norm.weight])[symbol:model.layers.15.self_attn.q_norm.weight] + tensor.CPU.register () -> (%5870:tensor<[128], Float32, CPU>[@model.layers.15.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=602), symbol:model.layers.15.self_attn.k_norm.weight])[symbol:model.layers.15.self_attn.k_norm.weight] + tensor.CPU.register () -> (%6498:tensor<[2048, 2048], Float32, CPU>[@model.layers.15.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=615), symbol:model.layers.15.self_attn.o_proj.weight])[symbol:model.layers.15.self_attn.o_proj.weight] + tensor.CPU.register () -> (%7534:tensor<[2048], Float32, CPU>[@model.layers.15.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=618), symbol:model.layers.15.post_attention_layernorm.weight])[symbol:model.layers.15.post_attention_layernorm.weight] + tensor.CPU.register () -> (%4158:tensor<[6144, 2048], Float32, CPU>[@model.layers.15.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=619), symbol:model.layers.15.mlp.up_proj.weight])[symbol:model.layers.15.mlp.up_proj.weight] + tensor.CPU.register () -> (%5708:tensor<[6144, 2048], Float32, CPU>[@model.layers.15.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: 
UInt4, scale_1_type: Float32), uuid=621), symbol:model.layers.15.mlp.gate_proj.weight])[symbol:model.layers.15.mlp.gate_proj.weight] + tensor.CPU.register () -> (%6996:tensor<[2048, 6144], Float32, CPU>[@model.layers.15.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=624), symbol:model.layers.15.mlp.down_proj.weight])[symbol:model.layers.15.mlp.down_proj.weight] + tensor.CPU.register () -> (%5186:tensor<[2048], Float32, CPU>[@model.layers.16.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=627), symbol:model.layers.16.input_layernorm.weight])[symbol:model.layers.16.input_layernorm.weight] + tensor.CPU.register () -> (%3600:tensor<[2048, 2048], Float32, CPU>[@model.layers.16.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=628), symbol:model.layers.16.self_attn.q_proj.weight])[symbol:model.layers.16.self_attn.q_proj.weight] + tensor.CPU.register () -> (%7334:tensor<[1024, 2048], Float32, CPU>[@model.layers.16.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=630), symbol:model.layers.16.self_attn.k_proj.weight])[symbol:model.layers.16.self_attn.k_proj.weight] + tensor.CPU.register () -> (%1736:tensor<[1024, 2048], Float32, CPU>[@model.layers.16.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=632), symbol:model.layers.16.self_attn.v_proj.weight])[symbol:model.layers.16.self_attn.v_proj.weight] + 
tensor.CPU.register () -> (%8015:tensor<[128], Float32, CPU>[@model.layers.16.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=635), symbol:model.layers.16.self_attn.q_norm.weight])[symbol:model.layers.16.self_attn.q_norm.weight] + tensor.CPU.register () -> (%8043:tensor<[128], Float32, CPU>[@model.layers.16.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=637), symbol:model.layers.16.self_attn.k_norm.weight])[symbol:model.layers.16.self_attn.k_norm.weight] + tensor.CPU.register () -> (%1749:tensor<[2048, 2048], Float32, CPU>[@model.layers.16.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=650), symbol:model.layers.16.self_attn.o_proj.weight])[symbol:model.layers.16.self_attn.o_proj.weight] + tensor.CPU.register () -> (%3582:tensor<[2048], Float32, CPU>[@model.layers.16.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=653), symbol:model.layers.16.post_attention_layernorm.weight])[symbol:model.layers.16.post_attention_layernorm.weight] + tensor.CPU.register () -> (%6009:tensor<[6144, 2048], Float32, CPU>[@model.layers.16.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=654), symbol:model.layers.16.mlp.up_proj.weight])[symbol:model.layers.16.mlp.up_proj.weight] + tensor.CPU.register () -> (%2546:tensor<[6144, 2048], Float32, CPU>[@model.layers.16.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 
32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=656), symbol:model.layers.16.mlp.gate_proj.weight])[symbol:model.layers.16.mlp.gate_proj.weight] + tensor.CPU.register () -> (%3430:tensor<[2048, 6144], Float32, CPU>[@model.layers.16.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=659), symbol:model.layers.16.mlp.down_proj.weight])[symbol:model.layers.16.mlp.down_proj.weight] + tensor.CPU.register () -> (%4318:tensor<[2048], Float32, CPU>[@model.layers.17.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=662), symbol:model.layers.17.input_layernorm.weight])[symbol:model.layers.17.input_layernorm.weight] + tensor.CPU.register () -> (%5713:tensor<[2048, 2048], Float32, CPU>[@model.layers.17.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=663), symbol:model.layers.17.self_attn.q_proj.weight])[symbol:model.layers.17.self_attn.q_proj.weight] + tensor.CPU.register () -> (%5811:tensor<[1024, 2048], Float32, CPU>[@model.layers.17.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=665), symbol:model.layers.17.self_attn.k_proj.weight])[symbol:model.layers.17.self_attn.k_proj.weight] + tensor.CPU.register () -> (%4106:tensor<[1024, 2048], Float32, CPU>[@model.layers.17.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=667), 
symbol:model.layers.17.self_attn.v_proj.weight])[symbol:model.layers.17.self_attn.v_proj.weight] + tensor.CPU.register () -> (%6494:tensor<[128], Float32, CPU>[@model.layers.17.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=670), symbol:model.layers.17.self_attn.q_norm.weight])[symbol:model.layers.17.self_attn.q_norm.weight] + tensor.CPU.register () -> (%7738:tensor<[128], Float32, CPU>[@model.layers.17.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=672), symbol:model.layers.17.self_attn.k_norm.weight])[symbol:model.layers.17.self_attn.k_norm.weight] + tensor.CPU.register () -> (%7459:tensor<[2048, 2048], Float32, CPU>[@model.layers.17.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=685), symbol:model.layers.17.self_attn.o_proj.weight])[symbol:model.layers.17.self_attn.o_proj.weight] + tensor.CPU.register () -> (%855:tensor<[2048], Float32, CPU>[@model.layers.17.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=688), symbol:model.layers.17.post_attention_layernorm.weight])[symbol:model.layers.17.post_attention_layernorm.weight] + tensor.CPU.register () -> (%8058:tensor<[6144, 2048], Float32, CPU>[@model.layers.17.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=689), symbol:model.layers.17.mlp.up_proj.weight])[symbol:model.layers.17.mlp.up_proj.weight] + tensor.CPU.register () -> (%6964:tensor<[6144, 2048], Float32, 
CPU>[@model.layers.17.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=691), symbol:model.layers.17.mlp.gate_proj.weight])[symbol:model.layers.17.mlp.gate_proj.weight] + tensor.CPU.register () -> (%2577:tensor<[2048, 6144], Float32, CPU>[@model.layers.17.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=694), symbol:model.layers.17.mlp.down_proj.weight])[symbol:model.layers.17.mlp.down_proj.weight] + tensor.CPU.register () -> (%3926:tensor<[2048], Float32, CPU>[@model.layers.18.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=697), symbol:model.layers.18.input_layernorm.weight])[symbol:model.layers.18.input_layernorm.weight] + tensor.CPU.register () -> (%1917:tensor<[2048, 2048], Float32, CPU>[@model.layers.18.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=698), symbol:model.layers.18.self_attn.q_proj.weight])[symbol:model.layers.18.self_attn.q_proj.weight] + tensor.CPU.register () -> (%1580:tensor<[1024, 2048], Float32, CPU>[@model.layers.18.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=700), symbol:model.layers.18.self_attn.k_proj.weight])[symbol:model.layers.18.self_attn.k_proj.weight] + tensor.CPU.register () -> (%4657:tensor<[1024, 2048], Float32, CPU>[@model.layers.18.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=702), symbol:model.layers.18.self_attn.v_proj.weight])[symbol:model.layers.18.self_attn.v_proj.weight] + tensor.CPU.register () -> (%5451:tensor<[128], Float32, CPU>[@model.layers.18.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=705), symbol:model.layers.18.self_attn.q_norm.weight])[symbol:model.layers.18.self_attn.q_norm.weight] + tensor.CPU.register () -> (%3229:tensor<[128], Float32, CPU>[@model.layers.18.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=707), symbol:model.layers.18.self_attn.k_norm.weight])[symbol:model.layers.18.self_attn.k_norm.weight] + tensor.CPU.register () -> (%1514:tensor<[2048, 2048], Float32, CPU>[@model.layers.18.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=720), symbol:model.layers.18.self_attn.o_proj.weight])[symbol:model.layers.18.self_attn.o_proj.weight] + tensor.CPU.register () -> (%910:tensor<[2048], Float32, CPU>[@model.layers.18.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=723), symbol:model.layers.18.post_attention_layernorm.weight])[symbol:model.layers.18.post_attention_layernorm.weight] + tensor.CPU.register () -> (%2694:tensor<[6144, 2048], Float32, CPU>[@model.layers.18.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=724), 
symbol:model.layers.18.mlp.up_proj.weight])[symbol:model.layers.18.mlp.up_proj.weight] + tensor.CPU.register () -> (%4440:tensor<[6144, 2048], Float32, CPU>[@model.layers.18.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=726), symbol:model.layers.18.mlp.gate_proj.weight])[symbol:model.layers.18.mlp.gate_proj.weight] + tensor.CPU.register () -> (%6785:tensor<[2048, 6144], Float32, CPU>[@model.layers.18.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=729), symbol:model.layers.18.mlp.down_proj.weight])[symbol:model.layers.18.mlp.down_proj.weight] + tensor.CPU.register () -> (%5637:tensor<[2048], Float32, CPU>[@model.layers.19.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=732), symbol:model.layers.19.input_layernorm.weight])[symbol:model.layers.19.input_layernorm.weight] + tensor.CPU.register () -> (%542:tensor<[2048, 2048], Float32, CPU>[@model.layers.19.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=733), symbol:model.layers.19.self_attn.q_proj.weight])[symbol:model.layers.19.self_attn.q_proj.weight] + tensor.CPU.register () -> (%6845:tensor<[1024, 2048], Float32, CPU>[@model.layers.19.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=735), symbol:model.layers.19.self_attn.k_proj.weight])[symbol:model.layers.19.self_attn.k_proj.weight] + tensor.CPU.register () -> (%6082:tensor<[1024, 2048], Float32, 
CPU>[@model.layers.19.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=737), symbol:model.layers.19.self_attn.v_proj.weight])[symbol:model.layers.19.self_attn.v_proj.weight] + tensor.CPU.register () -> (%6718:tensor<[128], Float32, CPU>[@model.layers.19.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=740), symbol:model.layers.19.self_attn.q_norm.weight])[symbol:model.layers.19.self_attn.q_norm.weight] + tensor.CPU.register () -> (%1204:tensor<[128], Float32, CPU>[@model.layers.19.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=742), symbol:model.layers.19.self_attn.k_norm.weight])[symbol:model.layers.19.self_attn.k_norm.weight] + tensor.CPU.register () -> (%7572:tensor<[2048, 2048], Float32, CPU>[@model.layers.19.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=755), symbol:model.layers.19.self_attn.o_proj.weight])[symbol:model.layers.19.self_attn.o_proj.weight] + tensor.CPU.register () -> (%3257:tensor<[2048], Float32, CPU>[@model.layers.19.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=758), symbol:model.layers.19.post_attention_layernorm.weight])[symbol:model.layers.19.post_attention_layernorm.weight] + tensor.CPU.register () -> (%6762:tensor<[6144, 2048], Float32, CPU>[@model.layers.19.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, 
quant_to_type: UInt4, scale_1_type: Float32), uuid=759), symbol:model.layers.19.mlp.up_proj.weight])[symbol:model.layers.19.mlp.up_proj.weight] + tensor.CPU.register () -> (%3095:tensor<[6144, 2048], Float32, CPU>[@model.layers.19.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=761), symbol:model.layers.19.mlp.gate_proj.weight])[symbol:model.layers.19.mlp.gate_proj.weight] + tensor.CPU.register () -> (%3251:tensor<[2048, 6144], Float32, CPU>[@model.layers.19.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=764), symbol:model.layers.19.mlp.down_proj.weight])[symbol:model.layers.19.mlp.down_proj.weight] + tensor.CPU.register () -> (%2201:tensor<[2048], Float32, CPU>[@model.layers.20.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=767), symbol:model.layers.20.input_layernorm.weight])[symbol:model.layers.20.input_layernorm.weight] + tensor.CPU.register () -> (%196:tensor<[2048, 2048], Float32, CPU>[@model.layers.20.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=768), symbol:model.layers.20.self_attn.q_proj.weight])[symbol:model.layers.20.self_attn.q_proj.weight] + tensor.CPU.register () -> (%179:tensor<[1024, 2048], Float32, CPU>[@model.layers.20.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=770), symbol:model.layers.20.self_attn.k_proj.weight])[symbol:model.layers.20.self_attn.k_proj.weight] + 
tensor.CPU.register () -> (%3406:tensor<[1024, 2048], Float32, CPU>[@model.layers.20.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=772), symbol:model.layers.20.self_attn.v_proj.weight])[symbol:model.layers.20.self_attn.v_proj.weight] + tensor.CPU.register () -> (%760:tensor<[128], Float32, CPU>[@model.layers.20.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=775), symbol:model.layers.20.self_attn.q_norm.weight])[symbol:model.layers.20.self_attn.q_norm.weight] + tensor.CPU.register () -> (%2753:tensor<[128], Float32, CPU>[@model.layers.20.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=777), symbol:model.layers.20.self_attn.k_norm.weight])[symbol:model.layers.20.self_attn.k_norm.weight] + tensor.CPU.register () -> (%5869:tensor<[2048, 2048], Float32, CPU>[@model.layers.20.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=790), symbol:model.layers.20.self_attn.o_proj.weight])[symbol:model.layers.20.self_attn.o_proj.weight] + tensor.CPU.register () -> (%771:tensor<[2048], Float32, CPU>[@model.layers.20.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=793), symbol:model.layers.20.post_attention_layernorm.weight])[symbol:model.layers.20.post_attention_layernorm.weight] + tensor.CPU.register () -> (%2006:tensor<[6144, 2048], Float32, CPU>[@model.layers.20.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, 
block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=794), symbol:model.layers.20.mlp.up_proj.weight])[symbol:model.layers.20.mlp.up_proj.weight] + tensor.CPU.register () -> (%6525:tensor<[6144, 2048], Float32, CPU>[@model.layers.20.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=796), symbol:model.layers.20.mlp.gate_proj.weight])[symbol:model.layers.20.mlp.gate_proj.weight] + tensor.CPU.register () -> (%6967:tensor<[2048, 6144], Float32, CPU>[@model.layers.20.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=799), symbol:model.layers.20.mlp.down_proj.weight])[symbol:model.layers.20.mlp.down_proj.weight] + tensor.CPU.register () -> (%4395:tensor<[2048], Float32, CPU>[@model.layers.21.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=802), symbol:model.layers.21.input_layernorm.weight])[symbol:model.layers.21.input_layernorm.weight] + tensor.CPU.register () -> (%4630:tensor<[2048, 2048], Float32, CPU>[@model.layers.21.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=803), symbol:model.layers.21.self_attn.q_proj.weight])[symbol:model.layers.21.self_attn.q_proj.weight] + tensor.CPU.register () -> (%4948:tensor<[1024, 2048], Float32, CPU>[@model.layers.21.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=805), 
symbol:model.layers.21.self_attn.k_proj.weight])[symbol:model.layers.21.self_attn.k_proj.weight] + tensor.CPU.register () -> (%5162:tensor<[1024, 2048], Float32, CPU>[@model.layers.21.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=807), symbol:model.layers.21.self_attn.v_proj.weight])[symbol:model.layers.21.self_attn.v_proj.weight] + tensor.CPU.register () -> (%7535:tensor<[128], Float32, CPU>[@model.layers.21.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=810), symbol:model.layers.21.self_attn.q_norm.weight])[symbol:model.layers.21.self_attn.q_norm.weight] + tensor.CPU.register () -> (%1698:tensor<[128], Float32, CPU>[@model.layers.21.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=812), symbol:model.layers.21.self_attn.k_norm.weight])[symbol:model.layers.21.self_attn.k_norm.weight] + tensor.CPU.register () -> (%4030:tensor<[2048, 2048], Float32, CPU>[@model.layers.21.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=825), symbol:model.layers.21.self_attn.o_proj.weight])[symbol:model.layers.21.self_attn.o_proj.weight] + tensor.CPU.register () -> (%3010:tensor<[2048], Float32, CPU>[@model.layers.21.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=828), symbol:model.layers.21.post_attention_layernorm.weight])[symbol:model.layers.21.post_attention_layernorm.weight] + tensor.CPU.register () -> (%5608:tensor<[6144, 2048], Float32, 
CPU>[@model.layers.21.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=829), symbol:model.layers.21.mlp.up_proj.weight])[symbol:model.layers.21.mlp.up_proj.weight] + tensor.CPU.register () -> (%4800:tensor<[6144, 2048], Float32, CPU>[@model.layers.21.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=831), symbol:model.layers.21.mlp.gate_proj.weight])[symbol:model.layers.21.mlp.gate_proj.weight] + tensor.CPU.register () -> (%3518:tensor<[2048, 6144], Float32, CPU>[@model.layers.21.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=834), symbol:model.layers.21.mlp.down_proj.weight])[symbol:model.layers.21.mlp.down_proj.weight] + tensor.CPU.register () -> (%5381:tensor<[2048], Float32, CPU>[@model.layers.22.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=837), symbol:model.layers.22.input_layernorm.weight])[symbol:model.layers.22.input_layernorm.weight] + tensor.CPU.register () -> (%956:tensor<[2048, 2048], Float32, CPU>[@model.layers.22.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=838), symbol:model.layers.22.self_attn.q_proj.weight])[symbol:model.layers.22.self_attn.q_proj.weight] + tensor.CPU.register () -> (%4159:tensor<[1024, 2048], Float32, CPU>[@model.layers.22.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, 
quant_to_type: UInt4, scale_1_type: Float32), uuid=840), symbol:model.layers.22.self_attn.k_proj.weight])[symbol:model.layers.22.self_attn.k_proj.weight] + tensor.CPU.register () -> (%6713:tensor<[1024, 2048], Float32, CPU>[@model.layers.22.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=842), symbol:model.layers.22.self_attn.v_proj.weight])[symbol:model.layers.22.self_attn.v_proj.weight] + tensor.CPU.register () -> (%1181:tensor<[128], Float32, CPU>[@model.layers.22.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=845), symbol:model.layers.22.self_attn.q_norm.weight])[symbol:model.layers.22.self_attn.q_norm.weight] + tensor.CPU.register () -> (%3001:tensor<[128], Float32, CPU>[@model.layers.22.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=847), symbol:model.layers.22.self_attn.k_norm.weight])[symbol:model.layers.22.self_attn.k_norm.weight] + tensor.CPU.register () -> (%8084:tensor<[2048, 2048], Float32, CPU>[@model.layers.22.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=860), symbol:model.layers.22.self_attn.o_proj.weight])[symbol:model.layers.22.self_attn.o_proj.weight] + tensor.CPU.register () -> (%357:tensor<[2048], Float32, CPU>[@model.layers.22.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=863), symbol:model.layers.22.post_attention_layernorm.weight])[symbol:model.layers.22.post_attention_layernorm.weight] + 
tensor.CPU.register () -> (%1068:tensor<[6144, 2048], Float32, CPU>[@model.layers.22.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=864), symbol:model.layers.22.mlp.up_proj.weight])[symbol:model.layers.22.mlp.up_proj.weight] + tensor.CPU.register () -> (%5057:tensor<[6144, 2048], Float32, CPU>[@model.layers.22.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=866), symbol:model.layers.22.mlp.gate_proj.weight])[symbol:model.layers.22.mlp.gate_proj.weight] + tensor.CPU.register () -> (%698:tensor<[2048, 6144], Float32, CPU>[@model.layers.22.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=869), symbol:model.layers.22.mlp.down_proj.weight])[symbol:model.layers.22.mlp.down_proj.weight] + tensor.CPU.register () -> (%456:tensor<[2048], Float32, CPU>[@model.layers.23.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=872), symbol:model.layers.23.input_layernorm.weight])[symbol:model.layers.23.input_layernorm.weight] + tensor.CPU.register () -> (%5941:tensor<[2048, 2048], Float32, CPU>[@model.layers.23.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=873), symbol:model.layers.23.self_attn.q_proj.weight])[symbol:model.layers.23.self_attn.q_proj.weight] + tensor.CPU.register () -> (%4304:tensor<[1024, 2048], Float32, CPU>[@model.layers.23.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, 
block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=875), symbol:model.layers.23.self_attn.k_proj.weight])[symbol:model.layers.23.self_attn.k_proj.weight] + tensor.CPU.register () -> (%3738:tensor<[1024, 2048], Float32, CPU>[@model.layers.23.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=877), symbol:model.layers.23.self_attn.v_proj.weight])[symbol:model.layers.23.self_attn.v_proj.weight] + tensor.CPU.register () -> (%5237:tensor<[128], Float32, CPU>[@model.layers.23.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=880), symbol:model.layers.23.self_attn.q_norm.weight])[symbol:model.layers.23.self_attn.q_norm.weight] + tensor.CPU.register () -> (%280:tensor<[128], Float32, CPU>[@model.layers.23.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=882), symbol:model.layers.23.self_attn.k_norm.weight])[symbol:model.layers.23.self_attn.k_norm.weight] + tensor.CPU.register () -> (%7541:tensor<[2048, 2048], Float32, CPU>[@model.layers.23.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=895), symbol:model.layers.23.self_attn.o_proj.weight])[symbol:model.layers.23.self_attn.o_proj.weight] + tensor.CPU.register () -> (%2827:tensor<[2048], Float32, CPU>[@model.layers.23.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=898), 
symbol:model.layers.23.post_attention_layernorm.weight])[symbol:model.layers.23.post_attention_layernorm.weight] + tensor.CPU.register () -> (%2427:tensor<[6144, 2048], Float32, CPU>[@model.layers.23.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=899), symbol:model.layers.23.mlp.up_proj.weight])[symbol:model.layers.23.mlp.up_proj.weight] + tensor.CPU.register () -> (%5935:tensor<[6144, 2048], Float32, CPU>[@model.layers.23.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=901), symbol:model.layers.23.mlp.gate_proj.weight])[symbol:model.layers.23.mlp.gate_proj.weight] + tensor.CPU.register () -> (%11:tensor<[2048, 6144], Float32, CPU>[@model.layers.23.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=904), symbol:model.layers.23.mlp.down_proj.weight])[symbol:model.layers.23.mlp.down_proj.weight] + tensor.CPU.register () -> (%4063:tensor<[2048], Float32, CPU>[@model.layers.24.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=907), symbol:model.layers.24.input_layernorm.weight])[symbol:model.layers.24.input_layernorm.weight] + tensor.CPU.register () -> (%1741:tensor<[2048, 2048], Float32, CPU>[@model.layers.24.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=908), symbol:model.layers.24.self_attn.q_proj.weight])[symbol:model.layers.24.self_attn.q_proj.weight] + tensor.CPU.register () -> (%7443:tensor<[1024, 2048], 
Float32, CPU>[@model.layers.24.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=910), symbol:model.layers.24.self_attn.k_proj.weight])[symbol:model.layers.24.self_attn.k_proj.weight] + tensor.CPU.register () -> (%3162:tensor<[1024, 2048], Float32, CPU>[@model.layers.24.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=912), symbol:model.layers.24.self_attn.v_proj.weight])[symbol:model.layers.24.self_attn.v_proj.weight] + tensor.CPU.register () -> (%5942:tensor<[128], Float32, CPU>[@model.layers.24.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=915), symbol:model.layers.24.self_attn.q_norm.weight])[symbol:model.layers.24.self_attn.q_norm.weight] + tensor.CPU.register () -> (%1980:tensor<[128], Float32, CPU>[@model.layers.24.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=917), symbol:model.layers.24.self_attn.k_norm.weight])[symbol:model.layers.24.self_attn.k_norm.weight] + tensor.CPU.register () -> (%5547:tensor<[2048, 2048], Float32, CPU>[@model.layers.24.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=930), symbol:model.layers.24.self_attn.o_proj.weight])[symbol:model.layers.24.self_attn.o_proj.weight] + tensor.CPU.register () -> (%5937:tensor<[2048], Float32, CPU>[@model.layers.24.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=933), symbol:model.layers.24.post_attention_layernorm.weight])[symbol:model.layers.24.post_attention_layernorm.weight] + tensor.CPU.register () -> (%6475:tensor<[6144, 2048], Float32, CPU>[@model.layers.24.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=934), symbol:model.layers.24.mlp.up_proj.weight])[symbol:model.layers.24.mlp.up_proj.weight] + tensor.CPU.register () -> (%7634:tensor<[6144, 2048], Float32, CPU>[@model.layers.24.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=936), symbol:model.layers.24.mlp.gate_proj.weight])[symbol:model.layers.24.mlp.gate_proj.weight] + tensor.CPU.register () -> (%2837:tensor<[2048, 6144], Float32, CPU>[@model.layers.24.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=939), symbol:model.layers.24.mlp.down_proj.weight])[symbol:model.layers.24.mlp.down_proj.weight] + tensor.CPU.register () -> (%2698:tensor<[2048], Float32, CPU>[@model.layers.25.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=942), symbol:model.layers.25.input_layernorm.weight])[symbol:model.layers.25.input_layernorm.weight] + tensor.CPU.register () -> (%7312:tensor<[2048, 2048], Float32, CPU>[@model.layers.25.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=943), 
symbol:model.layers.25.self_attn.q_proj.weight])[symbol:model.layers.25.self_attn.q_proj.weight] + tensor.CPU.register () -> (%8046:tensor<[1024, 2048], Float32, CPU>[@model.layers.25.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=945), symbol:model.layers.25.self_attn.k_proj.weight])[symbol:model.layers.25.self_attn.k_proj.weight] + tensor.CPU.register () -> (%8035:tensor<[1024, 2048], Float32, CPU>[@model.layers.25.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=947), symbol:model.layers.25.self_attn.v_proj.weight])[symbol:model.layers.25.self_attn.v_proj.weight] + tensor.CPU.register () -> (%5499:tensor<[128], Float32, CPU>[@model.layers.25.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=950), symbol:model.layers.25.self_attn.q_norm.weight])[symbol:model.layers.25.self_attn.q_norm.weight] + tensor.CPU.register () -> (%3571:tensor<[128], Float32, CPU>[@model.layers.25.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=952), symbol:model.layers.25.self_attn.k_norm.weight])[symbol:model.layers.25.self_attn.k_norm.weight] + tensor.CPU.register () -> (%6118:tensor<[2048, 2048], Float32, CPU>[@model.layers.25.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=965), symbol:model.layers.25.self_attn.o_proj.weight])[symbol:model.layers.25.self_attn.o_proj.weight] + tensor.CPU.register () -> (%3125:tensor<[2048], Float32, 
CPU>[@model.layers.25.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=968), symbol:model.layers.25.post_attention_layernorm.weight])[symbol:model.layers.25.post_attention_layernorm.weight] + tensor.CPU.register () -> (%1187:tensor<[6144, 2048], Float32, CPU>[@model.layers.25.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=969), symbol:model.layers.25.mlp.up_proj.weight])[symbol:model.layers.25.mlp.up_proj.weight] + tensor.CPU.register () -> (%327:tensor<[6144, 2048], Float32, CPU>[@model.layers.25.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=971), symbol:model.layers.25.mlp.gate_proj.weight])[symbol:model.layers.25.mlp.gate_proj.weight] + tensor.CPU.register () -> (%1157:tensor<[2048, 6144], Float32, CPU>[@model.layers.25.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=974), symbol:model.layers.25.mlp.down_proj.weight])[symbol:model.layers.25.mlp.down_proj.weight] + tensor.CPU.register () -> (%6051:tensor<[2048], Float32, CPU>[@model.layers.26.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=977), symbol:model.layers.26.input_layernorm.weight])[symbol:model.layers.26.input_layernorm.weight] + tensor.CPU.register () -> (%3763:tensor<[2048, 2048], Float32, CPU>[@model.layers.26.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, 
quant_to_type: UInt4, scale_1_type: Float32), uuid=978), symbol:model.layers.26.self_attn.q_proj.weight])[symbol:model.layers.26.self_attn.q_proj.weight] + tensor.CPU.register () -> (%6974:tensor<[1024, 2048], Float32, CPU>[@model.layers.26.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=980), symbol:model.layers.26.self_attn.k_proj.weight])[symbol:model.layers.26.self_attn.k_proj.weight] + tensor.CPU.register () -> (%3131:tensor<[1024, 2048], Float32, CPU>[@model.layers.26.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=982), symbol:model.layers.26.self_attn.v_proj.weight])[symbol:model.layers.26.self_attn.v_proj.weight] + tensor.CPU.register () -> (%5543:tensor<[128], Float32, CPU>[@model.layers.26.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=985), symbol:model.layers.26.self_attn.q_norm.weight])[symbol:model.layers.26.self_attn.q_norm.weight] + tensor.CPU.register () -> (%7751:tensor<[128], Float32, CPU>[@model.layers.26.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=987), symbol:model.layers.26.self_attn.k_norm.weight])[symbol:model.layers.26.self_attn.k_norm.weight] + tensor.CPU.register () -> (%4475:tensor<[2048, 2048], Float32, CPU>[@model.layers.26.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1000), symbol:model.layers.26.self_attn.o_proj.weight])[symbol:model.layers.26.self_attn.o_proj.weight] + 
tensor.CPU.register () -> (%7597:tensor<[2048], Float32, CPU>[@model.layers.26.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1003), symbol:model.layers.26.post_attention_layernorm.weight])[symbol:model.layers.26.post_attention_layernorm.weight] + tensor.CPU.register () -> (%3458:tensor<[6144, 2048], Float32, CPU>[@model.layers.26.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1004), symbol:model.layers.26.mlp.up_proj.weight])[symbol:model.layers.26.mlp.up_proj.weight] + tensor.CPU.register () -> (%6097:tensor<[6144, 2048], Float32, CPU>[@model.layers.26.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1006), symbol:model.layers.26.mlp.gate_proj.weight])[symbol:model.layers.26.mlp.gate_proj.weight] + tensor.CPU.register () -> (%1186:tensor<[2048, 6144], Float32, CPU>[@model.layers.26.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1009), symbol:model.layers.26.mlp.down_proj.weight])[symbol:model.layers.26.mlp.down_proj.weight] + tensor.CPU.register () -> (%6869:tensor<[2048], Float32, CPU>[@model.layers.27.input_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1012), symbol:model.layers.27.input_layernorm.weight])[symbol:model.layers.27.input_layernorm.weight] + tensor.CPU.register () -> (%513:tensor<[2048, 2048], Float32, CPU>[@model.layers.27.self_attn.q_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, 
block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1013), symbol:model.layers.27.self_attn.q_proj.weight])[symbol:model.layers.27.self_attn.q_proj.weight] + tensor.CPU.register () -> (%49:tensor<[1024, 2048], Float32, CPU>[@model.layers.27.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1015), symbol:model.layers.27.self_attn.k_proj.weight])[symbol:model.layers.27.self_attn.k_proj.weight] + tensor.CPU.register () -> (%7169:tensor<[1024, 2048], Float32, CPU>[@model.layers.27.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1017), symbol:model.layers.27.self_attn.v_proj.weight])[symbol:model.layers.27.self_attn.v_proj.weight] + tensor.CPU.register () -> (%6403:tensor<[128], Float32, CPU>[@model.layers.27.self_attn.q_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1020), symbol:model.layers.27.self_attn.q_norm.weight])[symbol:model.layers.27.self_attn.q_norm.weight] + tensor.CPU.register () -> (%3420:tensor<[128], Float32, CPU>[@model.layers.27.self_attn.k_norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1022), symbol:model.layers.27.self_attn.k_norm.weight])[symbol:model.layers.27.self_attn.k_norm.weight] + tensor.CPU.register () -> (%250:tensor<[2048, 2048], Float32, CPU>[@model.layers.27.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1035), 
symbol:model.layers.27.self_attn.o_proj.weight])[symbol:model.layers.27.self_attn.o_proj.weight] + tensor.CPU.register () -> (%2391:tensor<[2048], Float32, CPU>[@model.layers.27.post_attention_layernorm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1038), symbol:model.layers.27.post_attention_layernorm.weight])[symbol:model.layers.27.post_attention_layernorm.weight] + tensor.CPU.register () -> (%3707:tensor<[6144, 2048], Float32, CPU>[@model.layers.27.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1039), symbol:model.layers.27.mlp.up_proj.weight])[symbol:model.layers.27.mlp.up_proj.weight] + tensor.CPU.register () -> (%6283:tensor<[6144, 2048], Float32, CPU>[@model.layers.27.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1041), symbol:model.layers.27.mlp.gate_proj.weight])[symbol:model.layers.27.mlp.gate_proj.weight] + tensor.CPU.register () -> (%2073:tensor<[2048, 6144], Float32, CPU>[@model.layers.27.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1044), symbol:model.layers.27.mlp.down_proj.weight])[symbol:model.layers.27.mlp.down_proj.weight] + tensor.CPU.register () -> (%6469:tensor<[2048], Float32, CPU>[@model.norm.weight][quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1047), symbol:model.norm.weight])[symbol:model.norm.weight] + tensor.CPU.register () -> (%2672:tensor<[151936, 2048], Float32, CPU>[@lm_head.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, 
quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1048), symbol:lm_head.weight])[symbol:lm_head.weight] } } graph.SubGraphOp @deinit [symbol:deinit] { @@ -321,1697 +321,2116 @@ } } - graph.CallGraphOp @model (%8013:tensor<[1, 32], Int32, CPU>[quant_recipe:QuantSpec(Raw(type: Int32), uuid=0)], %8071:tensor<[1, 32], Int64, CPU>[quant_recipe:QuantSpec(Raw(type: Int64), uuid=1)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8015:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %8017:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %8019:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %8021:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %8023:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %8025:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %8027:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %8029:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %8031:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %8033:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %8035:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %8037:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %8039:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %8041:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %8043:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %8045:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %8047:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %8049:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %8051:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %8053:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=22)], %8055:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %8057:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %8059:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %8061:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %8063:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %8065:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %8067:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %8069:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %8016:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)], %8018:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)], %8020:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)], %8022:tensor<[1, 8, 992, 128], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)], %8024:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)], %8026:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)], %8028:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)], %8030:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)], %8032:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)], %8034:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)], %8036:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)], %8038:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)], %8040:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)], %8042:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)], %8044:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 
127, quant_to_type: Int8, scale_type: Float32), uuid=45)], %8046:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)], %8048:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)], %8050:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)], %8052:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)], %8054:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)], %8056:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)], %8058:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)], %8060:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)], %8062:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)], %8064:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)], %8066:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)], %8068:tensor<[1, 8, 992, 
128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)], %8070:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%9225:tensor<[1, 32, 151936], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1021)], %8089:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78)], %8130:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112)], %8171:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146)], %8212:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180)], %8253:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214)], %8294:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248)], %8335:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282)], %8376:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)], %8417:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350)], %8458:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384)], %8499:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418)], %8540:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452)], %8581:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486)], %8622:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520)], %8663:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %8704:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588)], %8745:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622)], %8786:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656)], %8827:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690)], %8868:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724)], 
%8909:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758)], %8950:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792)], %8991:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)], %9032:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860)], %9073:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894)], %9114:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928)], %9155:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962)], %9196:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996)], %8091:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=80)], %8132:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=114)], %8173:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=148)], %8214:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=182)], %8255:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=216)], %8296:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=250)], %8337:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)], %8378:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=318)], %8419:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=352)], %8460:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=386)], %8501:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=420)], %8542:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=454)], %8583:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=488)], %8624:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=522)], %8665:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=556)], %8706:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=590)], %8747:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=624)], %8788:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=658)], %8829:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=692)], %8870:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=726)], %8911:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=760)], %8952:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)], %8993:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=828)], %9034:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=862)], %9075:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=896)], %9116:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=930)], %9157:tensor<[1, 8, 32, 128], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=964)], %9198:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=998)]) + graph.CallGraphOp @model (%8206:tensor<[1, 32], Int32, CPU>[quant_recipe:QuantSpec(Raw(type: Int32), uuid=0)], %8264:tensor<[32], Int32, CPU>[quant_recipe:QuantSpec(Raw(type: Int32), uuid=1)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8208:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %8210:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %8212:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %8214:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %8216:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %8218:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %8220:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %8222:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], 
%8224:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %8226:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %8228:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %8230:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %8232:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %8234:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %8236:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %8238:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %8240:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %8242:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %8244:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %8246:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %8248:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %8250:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %8252:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %8254:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %8256:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %8258:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %8260:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %8262:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %8209:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31)], %8211:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32)], %8213:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: 
UInt8, scale_type: Float32), uuid=33)], %8215:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34)], %8217:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35)], %8219:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36)], %8221:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37)], %8223:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38)], %8225:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39)], %8227:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40)], %8229:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41)], %8231:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42)], %8233:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43)], %8235:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44)], %8237:tensor<[1, 8, 992, 128], UInt8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45)], %8239:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46)], %8241:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47)], %8243:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48)], %8245:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49)], %8247:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50)], %8249:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51)], %8251:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52)], %8253:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53)], %8255:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54)], %8257:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55)], %8259:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, 
scale_type: Float32), uuid=56)], %8261:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57)], %8263:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58)]) -> (%9726:tensor<[1, 32, 151936], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1049)], %8291:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79)], %8343:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114)], %8395:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149)], %8447:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184)], %8499:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219)], %8551:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254)], %8603:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289)], %8655:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324)], %8707:tensor<[1, 8, 128, 32], 
UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359)], %8759:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394)], %8811:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429)], %8863:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464)], %8915:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499)], %8967:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534)], %9019:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569)], %9071:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604)], %9123:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639)], %9175:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674)], %9227:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709)], %9279:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, 
quant_to_type: UInt8, scale_type: Float32), uuid=744)], %9331:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779)], %9383:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814)], %9435:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849)], %9487:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884)], %9539:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919)], %9591:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954)], %9643:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989)], %9695:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024)], %8293:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=81)], %8345:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=116)], %8397:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=151)], %8449:tensor<[1, 8, 32, 128], 
UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=186)], %8501:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=221)], %8553:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=256)], %8605:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=291)], %8657:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=326)], %8709:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=361)], %8761:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=396)], %8813:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=431)], %8865:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=466)], %8917:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=501)], %8969:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=536)], %9021:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, 
quant_to_type: UInt8, scale_type: Float32), uuid=571)], %9073:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=606)], %9125:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=641)], %9177:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=676)], %9229:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=711)], %9281:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=746)], %9333:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=781)], %9385:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=816)], %9437:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=851)], %9489:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=886)], %9541:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=921)], %9593:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=956)], %9645:tensor<[1, 8, 32, 128], 
UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=991)], %9697:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1026)]) graph.SubGraphOp @model [using_qnn:true, symbol:model] { - (%8013:tensor<[1, 32], Int32, CPU>[quant_recipe:QuantSpec(Raw(type: Int32), uuid=0)], %8071:tensor<[1, 32], Int64, CPU>[quant_recipe:QuantSpec(Raw(type: Int64), uuid=1)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8015:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %8017:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %8019:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %8021:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %8023:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %8025:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %8027:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %8029:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=10)], %8031:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %8033:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %8035:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %8037:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %8039:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %8041:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %8043:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %8045:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %8047:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %8049:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %8051:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %8053:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %8055:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %8057:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %8059:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %8061:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %8063:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %8065:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %8067:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %8069:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %8016:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)], %8018:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)], %8020:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=33)], %8022:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)], %8024:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)], %8026:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)], %8028:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)], %8030:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)], %8032:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)], %8034:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)], %8036:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)], %8038:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)], %8040:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)], %8042:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)], %8044:tensor<[1, 8, 992, 128], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)], %8046:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)], %8048:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)], %8050:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)], %8052:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)], %8054:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)], %8056:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)], %8058:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)], %8060:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)], %8062:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)], %8064:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)], %8066:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 
127, quant_to_type: Int8, scale_type: Float32), uuid=56)], %8068:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)], %8070:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%9225:tensor<[1, 32, 151936], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1021)], %8089:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78)], %8130:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112)], %8171:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146)], %8212:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180)], %8253:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214)], %8294:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248)], %8335:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282)], %8376:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)], %8417:tensor<[1, 
8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350)], %8458:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384)], %8499:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418)], %8540:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452)], %8581:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486)], %8622:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520)], %8663:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %8704:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588)], %8745:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622)], %8786:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656)], %8827:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690)], %8868:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724)], %8909:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758)], %8950:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792)], %8991:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)], %9032:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860)], %9073:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894)], %9114:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928)], %9155:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962)], %9196:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996)], %8091:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=80)], %8132:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=114)], %8173:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=148)], 
%8214:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=182)], %8255:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=216)], %8296:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=250)], %8337:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)], %8378:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=318)], %8419:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=352)], %8460:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=386)], %8501:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=420)], %8542:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=454)], %8583:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=488)], %8624:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=522)], %8665:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)], %8706:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=590)], %8747:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=624)], %8788:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=658)], %8829:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=692)], %8870:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=726)], %8911:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=760)], %8952:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)], %8993:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=828)], %9034:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=862)], %9075:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=896)], %9116:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=930)], %9157:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=964)], %9198:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=998)]) { - linalg.CPU.EmbeddingOp (%8013:tensor<[1, 32], Int32, CPU>[quant_recipe:QuantSpec(Raw(type: Int32), uuid=0)]) -> (%8072:tensor<[1, 32, 2048], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=59)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float32), uuid=59), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60), weight_weight:QuantSpec(Raw(type: Float32), uuid=61))] (%8072:tensor<[1, 32, 2048], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=59)]) -> (%8073:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int64), uuid=1), outputs_0:QuantSpec(Raw(type: Int64), uuid=1), )] (%8071:tensor<[1, 32], Int64, CPU>[quant_recipe:QuantSpec(Raw(type: Int64), uuid=1)]) -> (%8071:tensor<[32], Int64, CPU>[quant_recipe:QuantSpec(Raw(type: Int64), uuid=1)]) - linalg.CPU.IndexOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), )] (%8011:tensor<[1, 1024, 128], Int16PerTensor, CPU>[@rope_sin][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), 
symbol:rope_sin]) -> (%8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)]) - linalg.CPU.IndexOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), )] (%8012:tensor<[1, 1024, 128], Int16PerTensor, CPU>[@rope_cos][quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), symbol:rope_cos]) -> (%8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) - graph.CallGraphOp @model.layers.0 (%8073:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8015:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %8016:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) -> (%8116:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)], %8089:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78)], %8091:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=80)]) - graph.CallGraphOp @model.layers.1 (%8116:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8017:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %8018:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) -> (%8157:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)], %8130:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112)], %8132:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=114)]) - graph.CallGraphOp 
@model.layers.2 (%8157:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8019:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %8020:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) -> (%8198:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)], %8171:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146)], %8173:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=148)]) - graph.CallGraphOp @model.layers.3 (%8198:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8021:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %8022:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) -> (%8239:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)], %8212:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180)], %8214:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=182)]) - graph.CallGraphOp @model.layers.4 (%8239:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8023:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %8024:tensor<[1, 8, 992, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) -> (%8280:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)], %8253:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214)], %8255:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=216)]) - graph.CallGraphOp @model.layers.5 (%8280:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8025:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %8026:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) -> (%8321:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)], %8294:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=248)], %8296:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=250)]) - graph.CallGraphOp @model.layers.6 (%8321:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8027:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %8028:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) -> (%8362:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)], %8335:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282)], %8337:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)]) - graph.CallGraphOp @model.layers.7 (%8362:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)], %8074:tensor<[1, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8029:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %8030:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) -> (%8403:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)], %8376:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)], %8378:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=318)]) - graph.CallGraphOp @model.layers.8 (%8403:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8031:tensor<[1, 8, 128, 992], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %8032:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) -> (%8444:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)], %8417:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350)], %8419:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=352)]) - graph.CallGraphOp @model.layers.9 (%8444:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8033:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %8034:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) -> (%8485:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=405)], %8458:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384)], %8460:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=386)]) - graph.CallGraphOp @model.layers.10 (%8485:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8035:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %8036:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) -> (%8526:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)], %8499:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418)], %8501:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=420)]) - graph.CallGraphOp @model.layers.11 (%8526:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8037:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %8038:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) -> (%8567:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)], %8540:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452)], %8542:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=454)]) - graph.CallGraphOp @model.layers.12 (%8567:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8039:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %8040:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) -> (%8608:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)], %8581:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486)], %8583:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=488)]) - graph.CallGraphOp @model.layers.13 (%8608:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8041:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %8042:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=44)]) -> (%8649:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)], %8622:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520)], %8624:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=522)]) - graph.CallGraphOp @model.layers.14 (%8649:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8043:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %8044:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) -> (%8690:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)], %8663:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %8665:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)]) - graph.CallGraphOp @model.layers.15 (%8690:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8045:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %8046:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) -> (%8731:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)], %8704:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588)], %8706:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=590)]) - graph.CallGraphOp @model.layers.16 (%8731:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8047:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %8048:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) -> (%8772:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)], %8745:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622)], %8747:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=624)]) - graph.CallGraphOp @model.layers.17 (%8772:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8049:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %8050:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) -> (%8813:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)], %8786:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656)], %8788:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=658)]) - graph.CallGraphOp @model.layers.18 (%8813:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8051:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %8052:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) -> (%8854:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)], %8827:tensor<[1, 8, 128, 32], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690)], %8829:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=692)]) - graph.CallGraphOp @model.layers.19 (%8854:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8053:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %8054:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) -> (%8895:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)], %8868:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724)], %8870:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=726)]) - graph.CallGraphOp @model.layers.20 (%8895:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8055:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %8056:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) -> (%8936:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)], %8909:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758)], %8911:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=760)]) - graph.CallGraphOp @model.layers.21 (%8936:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 
1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8057:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %8058:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) -> (%8977:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)], %8950:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792)], %8952:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)]) - graph.CallGraphOp @model.layers.22 (%8977:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8059:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %8060:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) -> (%9018:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)], %8991:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)], %8993:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=828)]) - graph.CallGraphOp @model.layers.23 (%9018:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8061:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %8062:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) -> (%9059:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)], %9032:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860)], %9034:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: 
Int8, scale_type: Float32), uuid=862)]) - graph.CallGraphOp @model.layers.24 (%9059:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8063:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %8064:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) -> (%9100:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)], %9073:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894)], %9075:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=896)]) - graph.CallGraphOp @model.layers.25 (%9100:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 
128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8065:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %8066:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) -> (%9141:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)], %9114:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928)], %9116:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=930)]) - graph.CallGraphOp @model.layers.26 (%9141:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8067:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %8068:tensor<[1, 8, 992, 128], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) -> (%9182:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)], %9155:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962)], %9157:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=964)]) - graph.CallGraphOp @model.layers.27 (%9182:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8069:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %8070:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%9223:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017)], %9196:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=996)], %9198:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=998)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1018), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1019))] (%9223:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017)]) -> (%9224:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1018)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1018), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1021), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1020)), using_qnn:true] (%9224:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1018)]) -> (%9225:tensor<[1, 32, 151936], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1021)]) - cf.ReturnOp (%9225:tensor<[1, 32, 151936], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1021)], %8089:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78)], %8130:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112)], %8171:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146)], %8212:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180)], %8253:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214)], %8294:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248)], %8335:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282)], %8376:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)], %8417:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350)], %8458:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384)], %8499:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418)], 
%8540:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452)], %8581:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486)], %8622:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520)], %8663:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %8704:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588)], %8745:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622)], %8786:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656)], %8827:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690)], %8868:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724)], %8909:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758)], %8950:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792)], %8991:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)], %9032:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860)], %9073:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894)], %9114:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928)], %9155:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962)], %9196:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996)], %8091:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=80)], %8132:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=114)], %8173:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=148)], %8214:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=182)], %8255:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=216)], %8296:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=250)], %8337:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)], %8378:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=318)], %8419:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=352)], %8460:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=386)], %8501:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=420)], %8542:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=454)], %8583:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=488)], %8624:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=522)], %8665:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)], %8706:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=590)], %8747:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=624)], %8788:tensor<[1, 8, 32, 128], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=658)], %8829:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=692)], %8870:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=726)], %8911:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=760)], %8952:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)], %8993:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=828)], %9034:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=862)], %9075:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=896)], %9116:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=930)], %9157:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=964)], %9198:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=998)]) -> () + (%8206:tensor<[1, 32], Int32, CPU>[quant_recipe:QuantSpec(Raw(type: Int32), uuid=0)], %8264:tensor<[32], 
Int32, CPU>[quant_recipe:QuantSpec(Raw(type: Int32), uuid=1)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8208:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %8210:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %8212:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %8214:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %8216:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %8218:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %8220:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %8222:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %8224:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %8226:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %8228:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %8230:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %8232:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %8234:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %8236:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %8238:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %8240:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %8242:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %8244:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %8246:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %8248:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %8250:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], 
%8252:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %8254:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %8256:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %8258:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %8260:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %8262:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %8209:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31)], %8211:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32)], %8213:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33)], %8215:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34)], %8217:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35)], %8219:tensor<[1, 8, 992, 128], UInt8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36)], %8221:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37)], %8223:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38)], %8225:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39)], %8227:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40)], %8229:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41)], %8231:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42)], %8233:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43)], %8235:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44)], %8237:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45)], %8239:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46)], %8241:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, 
scale_type: Float32), uuid=47)], %8243:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48)], %8245:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49)], %8247:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50)], %8249:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51)], %8251:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52)], %8253:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53)], %8255:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54)], %8257:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55)], %8259:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56)], %8261:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57)], %8263:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58)]) -> (%9726:tensor<[1, 32, 151936], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1049)], %8291:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79)], %8343:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114)], %8395:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149)], %8447:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184)], %8499:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219)], %8551:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254)], %8603:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289)], %8655:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324)], %8707:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359)], %8759:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394)], %8811:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 
255, quant_to_type: UInt8, scale_type: Float32), uuid=429)], %8863:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464)], %8915:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499)], %8967:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534)], %9019:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569)], %9071:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604)], %9123:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639)], %9175:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674)], %9227:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709)], %9279:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744)], %9331:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779)], %9383:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814)], %9435:tensor<[1, 8, 128, 32], 
UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849)], %9487:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884)], %9539:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919)], %9591:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954)], %9643:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989)], %9695:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024)], %8293:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=81)], %8345:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=116)], %8397:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=151)], %8449:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=186)], %8501:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=221)], %8553:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, 
quant_to_type: UInt8, scale_type: Float32), uuid=256)], %8605:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=291)], %8657:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=326)], %8709:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=361)], %8761:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=396)], %8813:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=431)], %8865:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=466)], %8917:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=501)], %8969:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=536)], %9021:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=571)], %9073:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=606)], %9125:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=641)], %9177:tensor<[1, 8, 32, 128], 
UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=676)], %9229:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=711)], %9281:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=746)], %9333:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=781)], %9385:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=816)], %9437:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=851)], %9489:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=886)], %9541:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=921)], %9593:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=956)], %9645:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=991)], %9697:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1026)]) { + linalg.CPU.EmbeddingOp (%8206:tensor<[1, 32], Int32, CPU>[quant_recipe:QuantSpec(Raw(type: Int32), uuid=0)]) -> 
(%8265:tensor<[1, 32, 2048], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=59)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float32), uuid=59, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), weight_weight:QuantSpec(Raw(type: Float32), uuid=61, solved=0))] (%8265:tensor<[1, 32, 2048], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=59)]) -> (%8266:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.IndexOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=62, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8204:tensor<[1024, 5, 128], UInt16PerTensor, CPU>[@rope_sin][quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=62), symbol:rope_sin]) -> (%8267:tensor<[1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.IndexOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=64, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8205:tensor<[1024, 5, 128], UInt16PerTensor, CPU>[@rope_cos][quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=64), symbol:rope_cos]) -> (%8268:tensor<[1, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + graph.CallGraphOp @model.layers.0 (%8266:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8208:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %8209:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31)]) -> (%8320:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8291:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79)], %8293:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=81)]) + graph.CallGraphOp @model.layers.1 (%8320:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8210:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %8211:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32)]) -> (%8372:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8343:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114)], %8345:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=116)]) + graph.CallGraphOp @model.layers.2 (%8372:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8212:tensor<[1, 8, 
128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %8213:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33)]) -> (%8424:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8395:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149)], %8397:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=151)]) + graph.CallGraphOp @model.layers.3 (%8424:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8214:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %8215:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34)]) -> (%8476:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8447:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184)], %8449:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=186)]) + graph.CallGraphOp @model.layers.4 (%8476:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8216:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %8217:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35)]) -> (%8528:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8499:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219)], %8501:tensor<[1, 8, 32, 128], UInt8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=221)]) + graph.CallGraphOp @model.layers.5 (%8528:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8218:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %8219:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36)]) -> (%8580:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8551:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254)], %8553:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=256)]) + graph.CallGraphOp @model.layers.6 (%8580:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8220:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %8221:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37)]) -> (%8632:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8603:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289)], %8605:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=291)]) + graph.CallGraphOp @model.layers.7 (%8632:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8222:tensor<[1, 8, 
128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %8223:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38)]) -> (%8684:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8655:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324)], %8657:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=326)]) + graph.CallGraphOp @model.layers.8 (%8684:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8224:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %8225:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39)]) -> (%8736:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8707:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359)], %8709:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=361)]) + graph.CallGraphOp @model.layers.9 (%8736:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8226:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %8227:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40)]) -> (%8788:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8759:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394)], %8761:tensor<[1, 8, 32, 128], UInt8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=396)]) + graph.CallGraphOp @model.layers.10 (%8788:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8228:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %8229:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41)]) -> (%8840:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8811:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429)], %8813:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=431)]) + graph.CallGraphOp @model.layers.11 (%8840:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8230:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %8231:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42)]) -> (%8892:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8863:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464)], %8865:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=466)]) + graph.CallGraphOp @model.layers.12 (%8892:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8232:tensor<[1, 
8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %8233:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43)]) -> (%8944:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8915:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499)], %8917:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=501)]) + graph.CallGraphOp @model.layers.13 (%8944:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8234:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %8235:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44)]) -> (%8996:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8967:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534)], %8969:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=536)]) + graph.CallGraphOp @model.layers.14 (%8996:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8236:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %8237:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45)]) -> (%9048:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9019:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569)], %9021:tensor<[1, 8, 32, 128], UInt8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=571)]) + graph.CallGraphOp @model.layers.15 (%9048:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8238:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %8239:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46)]) -> (%9100:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9071:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604)], %9073:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=606)]) + graph.CallGraphOp @model.layers.16 (%9100:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8240:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %8241:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47)]) -> (%9152:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9123:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639)], %9125:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=641)]) + graph.CallGraphOp @model.layers.17 (%9152:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8242:tensor<[1, 
8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %8243:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48)]) -> (%9204:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9175:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674)], %9177:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=676)]) + graph.CallGraphOp @model.layers.18 (%9204:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8244:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %8245:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49)]) -> (%9256:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9227:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709)], %9229:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=711)]) + graph.CallGraphOp @model.layers.19 (%9256:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8246:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %8247:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50)]) -> (%9308:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9279:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744)], %9281:tensor<[1, 8, 32, 128], UInt8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=746)]) + graph.CallGraphOp @model.layers.20 (%9308:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8248:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %8249:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51)]) -> (%9360:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9331:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779)], %9333:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=781)]) + graph.CallGraphOp @model.layers.21 (%9360:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8250:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %8251:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52)]) -> (%9412:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9383:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814)], %9385:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=816)]) + graph.CallGraphOp @model.layers.22 (%9412:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8252:tensor<[1, 
8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %8253:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53)]) -> (%9464:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9435:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849)], %9437:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=851)]) + graph.CallGraphOp @model.layers.23 (%9464:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8254:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %8255:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54)]) -> (%9516:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9487:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884)], %9489:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=886)]) + graph.CallGraphOp @model.layers.24 (%9516:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8256:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %8257:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55)]) -> (%9568:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9539:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919)], %9541:tensor<[1, 8, 32, 128], UInt8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=921)]) + graph.CallGraphOp @model.layers.25 (%9568:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8258:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %8259:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56)]) -> (%9620:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9591:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954)], %9593:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=956)]) + graph.CallGraphOp @model.layers.26 (%9620:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8260:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %8261:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57)]) -> (%9672:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9643:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989)], %9645:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=991)]) + graph.CallGraphOp @model.layers.27 (%9672:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8262:tensor<[1, 
8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %8263:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58)]) -> (%9724:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9695:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024)], %9697:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1026)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1046, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1047, solved=0))] (%9724:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9725:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1046)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=1046, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1049, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1048, solved=0)), using_qnn:true] (%9725:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1046)]) -> (%9726:tensor<[1, 32, 151936], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1049)]) + cf.ReturnOp (%9726:tensor<[1, 32, 151936], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1049)], %8291:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79)], %8343:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114)], %8395:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149)], %8447:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184)], %8499:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219)], %8551:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, 
quant_to_type: UInt8, scale_type: Float32), uuid=254)], %8603:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289)], %8655:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324)], %8707:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359)], %8759:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394)], %8811:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429)], %8863:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464)], %8915:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499)], %8967:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534)], %9019:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569)], %9071:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604)], %9123:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639)], %9175:tensor<[1, 8, 128, 32], 
UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674)], %9227:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709)], %9279:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744)], %9331:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779)], %9383:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814)], %9435:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849)], %9487:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884)], %9539:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919)], %9591:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954)], %9643:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989)], %9695:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024)], %8293:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, 
quant_to_type: UInt8, scale_type: Float32), uuid=81)], %8345:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=116)], %8397:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=151)], %8449:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=186)], %8501:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=221)], %8553:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=256)], %8605:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=291)], %8657:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=326)], %8709:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=361)], %8761:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=396)], %8813:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=431)], %8865:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=466)], %8917:tensor<[1, 8, 32, 128], 
UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=501)], %8969:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=536)], %9021:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=571)], %9073:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=606)], %9125:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=641)], %9177:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=676)], %9229:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=711)], %9281:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=746)], %9333:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=781)], %9385:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=816)], %9437:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=851)], %9489:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, 
quant_to_type: UInt8, scale_type: Float32), uuid=886)], %9541:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=921)], %9593:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=956)], %9645:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=991)], %9697:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1026)]) -> () } } graph.SubGraphOp @model.layers.0 [using_qnn:true, symbol:model.layers.0] { - (%8073:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8015:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %8016:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) -> (%8116:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)], 
%8089:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78)], %8091:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=80)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=66), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=67))] (%8073:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60)]) -> (%8076:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=66)]) - graph.CallGraphOp @model.layers.0.self_attn (%8076:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=66)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8015:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %8016:tensor<[1, 
8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) -> (%8108:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90)], %8089:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78)], %8091:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=80)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90), )] (%8108:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90)], %8073:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60)]) -> (%8109:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=91), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=92))] (%8109:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90)]) -> (%8110:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=91)]) - graph.CallGraphOp @model.layers.0.mlp (%8110:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=91)]) -> (%8115:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99), )] (%8115:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)], %8109:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90)]) -> (%8116:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)]) - cf.ReturnOp (%8116:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)], %8089:tensor<[1, 
8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78)], %8091:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=80)]) -> () + (%8266:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8208:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %8209:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31)]) -> (%8320:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8291:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79)], %8293:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=81)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=66, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=67, solved=0))] (%8266:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8269:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=66)]) + graph.CallGraphOp @model.layers.0.self_attn (%8269:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=66)], %8267:tensor<[1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8208:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %8209:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31)]) -> (%8311:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=91)], %8291:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79)], %8293:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=81)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=91, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8266:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8311:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=91)]) -> (%8312:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=92, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=93, solved=0))] (%8312:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8313:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=92)]) + graph.CallGraphOp @model.layers.0.mlp (%8313:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=92)]) -> (%8319:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=100)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=100, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8312:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8319:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=100)]) -> (%8320:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + 
cf.ReturnOp (%8320:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8291:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79)], %8293:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=81)]) -> () } } graph.SubGraphOp @model.layers.0.self_attn [using_qnn:true, symbol:model.layers.0.self_attn] { - (%8076:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=66)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8015:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %8016:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) -> (%8108:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90)], %8089:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78)], 
%8091:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=80)]) { - linalg.CPU.LinearOp (%8076:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=66)]) -> (%8077:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=72)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=66), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=68))] (%8076:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=66)]) -> (%8078:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=66), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=70))] (%8076:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=66)]) -> (%8079:tensor<[1, 32, 1024], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=72), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=72), )] (%8077:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=72)]) -> (%8077:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=72)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=72), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=72), )] (%8077:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=72)]) -> (%8080:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=72)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69), )] (%8078:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69)]) -> (%8078:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69), )] (%8078:tensor<[1, 32, 8, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69)]) -> (%8081:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71), )] (%8079:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71)]) -> (%8079:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71), )] (%8079:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71)]) -> (%8082:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=72), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=73), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=74))] (%8080:tensor<[1, 16, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=72)]) -> (%8083:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=73)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=75), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=76))] (%8081:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69)]) -> (%8084:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=75)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=73), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=73), )] (%8083:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=73)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8085:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=73)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=75), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=75), )] (%8084:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=75)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8086:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=75)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=75), outputs_0:QuantSpec(Raw(type: Float16), uuid=77), )] (%8086:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=75)]) -> 
(%8087:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=77)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=77), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78), )] (%8087:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=77)]) -> (%8088:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78), )] (%8088:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78)]) -> (%8089:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71), outputs_0:QuantSpec(Raw(type: Float16), uuid=79), )] (%8082:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71)]) -> (%8090:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=79)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=79), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=80), )] 
(%8090:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=79)]) -> (%8091:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=80)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3), )] (%8015:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %8089:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78)]) -> (%8092:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=80), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31), )] (%8016:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)], %8091:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=80)]) -> (%8093:tensor<[1, 8, 1024, 
128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3), )] (%8092:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)]) -> (%8094:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31), )] (%8093:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) -> (%8095:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=73), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=81), )] (%8085:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=73)], %8094:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)]) -> (%8096:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=81)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=81), inputs_1:QuantSpec(Raw(type: Float32), uuid=82), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=81), )] (%8096:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=81)], %8097:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=82), constant:[0.088388346]]) -> (%8098:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=81)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=81), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83), )] (%8098:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=81)]) -> (%8099:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83)]) - linalg.CPU.AddOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83), inputs_1:QuantSpec(Raw(type: Int16), uuid=84), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83), )] (%8099:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83)], %8100:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=84), constant:[-20]]) -> (%8101:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=85), outputs_0:QuantSpec(Raw(type: UInt8), uuid=86), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8102:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=85), constant:[0]]) -> (%8103:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=86)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=86), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=81), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83), )] (%8103:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=86)], %8098:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=81)], %8101:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83)]) -> (%8104:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87), )] (%8104:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83)]) -> (%8105:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=88), )] (%8105:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87)], %8095:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) -> (%8106:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=88)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=88), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=88), )] (%8106:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=88)]) -> (%8107:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=88)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=88), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=88), )] (%8107:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=88)]) -> (%8107:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=88)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=88), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=89))] (%8107:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=88)]) -> (%8108:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90)]) - cf.ReturnOp (%8108:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90)], %8089:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=78)], %8091:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=80)]) -> () + (%8269:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=66)], %8267:tensor<[1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8208:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %8209:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31)]) -> (%8311:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=91)], %8291:tensor<[1, 8, 128, 32], UInt8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79)], %8293:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=81)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=66, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=69, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=68, solved=0))] (%8269:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=66)]) -> (%8270:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=69)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=66, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=71, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=70, solved=0))] (%8269:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=66)]) -> 
(%8271:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=71)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=66, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=73, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=72, solved=0))] (%8269:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=66)]) -> (%8272:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=73)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=69, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=69, solved=0), )] (%8270:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=69)]) -> (%8270:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=69)]) + linalg.CPU.TransposeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=69, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=69, solved=0), )] (%8270:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=69)]) -> (%8273:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=69)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=71, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=71, solved=0), )] (%8271:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=71)]) -> (%8271:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=71)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=71, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=71, solved=0), )] (%8271:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=71)]) -> (%8274:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=71)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=73, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=73, solved=0), )] (%8272:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=73)]) -> (%8272:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=73)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=73, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=73, solved=0), )] (%8272:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=73)]) -> (%8275:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=73)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=69, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=75, solved=0))] (%8273:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=69)]) -> (%8276:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=71, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=77, solved=0))] (%8274:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=71)]) -> (%8277:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, 
solved=0), )] (%8268:tensor<[1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), )] (%8276:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)]) -> (%8276:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), )] (%8276:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)]) -> (%8276:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), )] (%8276:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)]) -> (%8278:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), )] (%8278:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)], %8276:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)]) -> (%8279:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), )] (%8279:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8280:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=74, solved=0), )] (%8276:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8281:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), )] (%8281:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)], %8280:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)]) -> (%8282:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), )] (%8277:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) -> (%8277:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), )] (%8277:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) -> (%8277:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), )] (%8277:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) -> (%8283:tensor<[1, 8, 32, 64], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), )] (%8283:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)], %8277:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) -> (%8284:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), )] (%8284:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)], %8267:tensor<[1, 1, 32, 
128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8285:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), )] (%8277:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8286:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), )] (%8286:tensor<[1, 8, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)], %8285:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) -> (%8287:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=78, solved=0), )] (%8287:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) -> (%8288:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=78)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=78, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79, solved=0), )] (%8288:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=78)]) -> (%8289:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79, solved=0), )] (%8289:tensor<[1, 8, 32, 128], 
UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79)]) -> (%8291:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=73, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=80, solved=0), )] (%8275:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=73)]) -> (%8292:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=80)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=80, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=81, solved=0), )] (%8292:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=80)]) -> (%8293:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=81)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3, solved=0), )] (%8208:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=3)], %8291:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79)]) -> (%8295:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=81, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31, solved=0), )] (%8209:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31)], %8293:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=81)]) -> (%8296:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3, solved=0), )] (%8295:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)]) -> (%8297:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=3)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31, solved=0), )] (%8296:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31)]) -> (%8298:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=82, solved=0), )] (%8282:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)], %8297:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)]) -> (%8299:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=82)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=82, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=83, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=82, solved=0), )] (%8299:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=82)], %8300:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=83), constant:[0.088388346]]) -> (%8301:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=82)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=82, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=84, solved=0), )] (%8301:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=82)]) -> (%8302:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=84)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=84, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=85, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=84, solved=0), )] (%8302:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=84)], %8303:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=85), constant:[-20]]) -> (%8304:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=84)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=86, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=87, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8305:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=86), constant:[0]]) -> (%8306:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=87)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=87, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=82, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=84, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=84, solved=0), )] (%8306:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=87)], %8301:tensor<[1, 16, 32, 
1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=82)], %8304:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=84)]) -> (%8307:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=84)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=84, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=88, solved=0), )] (%8307:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=84)]) -> (%8308:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=88)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=88, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=89, solved=0), )] (%8308:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=88)], %8298:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31)]) -> (%8309:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=89)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=89, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=89, solved=0), )] (%8309:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=89)]) -> (%8310:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=89)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=89, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=89, solved=0), )] (%8310:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=89)]) -> (%8310:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=89)]) + 
linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=89, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=91, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=90, solved=0))] (%8310:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=89)]) -> (%8311:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=91)]) + cf.ReturnOp (%8311:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=91)], %8291:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79)], %8293:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=81)]) -> () } } graph.SubGraphOp @model.layers.0.mlp [using_qnn:true, symbol:model.layers.0.mlp] { - (%8110:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=91)]) -> (%8115:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)]) { - linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=91), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=93))] (%8110:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=91)]) -> (%8111:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=95), )] (%8111:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94)]) -> (%8112:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=95)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=91), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=97), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=96))] (%8110:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=91)]) -> (%8113:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=97)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=95), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=97), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=95), )] (%8112:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=95)], %8113:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=97)]) -> (%8114:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=95)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=95), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=98))] (%8114:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=95)]) -> (%8115:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)]) - cf.ReturnOp (%8115:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)]) -> () + (%8313:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=92)]) -> (%8319:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=100)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=92, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=95, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=94, solved=0))] (%8313:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=92)]) -> (%8314:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=95)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=92, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=97, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=96, solved=0))] (%8313:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=92)]) -> (%8315:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=98, solved=0), )] (%8315:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97)]) -> (%8316:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=98)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=98, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97, solved=0), )] (%8315:tensor<[1, 32, 6144], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97)], %8316:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=98)]) -> (%8317:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=95, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97, solved=0), )] (%8317:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97)], %8314:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=95)]) -> (%8318:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=100, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=99, solved=0))] (%8318:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97)]) -> (%8319:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=100)]) + cf.ReturnOp (%8319:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=100)]) -> () } } graph.SubGraphOp @model.layers.1 [using_qnn:true, symbol:model.layers.1] { - (%8116:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8017:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %8018:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) -> (%8157:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)], %8130:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112)], %8132:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=114)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=100), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=101))] (%8116:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)]) -> (%8117:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=100)]) - graph.CallGraphOp @model.layers.1.self_attn (%8117:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=100)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8017:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %8018:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) -> (%8149:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)], %8130:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112)], %8132:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=114)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124), )] (%8149:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)], %8116:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)]) -> (%8150:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=125), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=126))] (%8150:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)]) -> (%8151:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=125)]) - graph.CallGraphOp @model.layers.1.mlp (%8151:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=125)]) -> (%8156:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133), )] (%8156:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)], %8150:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)]) -> (%8157:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)]) - cf.ReturnOp 
(%8157:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)], %8130:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112)], %8132:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=114)]) -> () + (%8320:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8210:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %8211:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32)]) -> (%8372:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8343:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114)], %8345:tensor<[1, 8, 32, 128], UInt8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=116)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=101, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=102, solved=0))] (%8320:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8321:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=101)]) + graph.CallGraphOp @model.layers.1.self_attn (%8321:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=101)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8210:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %8211:tensor<[1, 8, 992, 128], UInt8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32)]) -> (%8363:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=126)], %8343:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114)], %8345:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=116)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=126, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8320:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8363:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=126)]) -> (%8364:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=127, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=128, solved=0))] (%8364:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8365:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=127)]) + graph.CallGraphOp @model.layers.1.mlp (%8365:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=127)]) -> (%8371:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=135)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=135, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8364:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8371:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=135)]) -> (%8372:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + cf.ReturnOp (%8372:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8343:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114)], %8345:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=116)]) -> () } } graph.SubGraphOp @model.layers.1.self_attn [using_qnn:true, symbol:model.layers.1.self_attn] { - (%8117:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=100)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8017:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %8018:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) -> (%8149:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)], %8130:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112)], %8132:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=114)]) { - linalg.CPU.LinearOp (%8117:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=100)]) -> (%8118:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=106)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=100), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=103), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=102))] (%8117:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=100)]) -> (%8119:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=103)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=100), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=105), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, 
block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=104))] (%8117:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=100)]) -> (%8120:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=105)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=106), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=106), )] (%8118:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=106)]) -> (%8118:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=106)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=106), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=106), )] (%8118:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=106)]) -> (%8121:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=106)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=103), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=103), )] (%8119:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=103)]) -> (%8119:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=103)]) - linalg.CPU.TransposeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=103), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=103), )] (%8119:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=103)]) -> (%8122:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=103)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=105), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=105), )] (%8120:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=105)]) -> (%8120:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=105)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=105), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=105), )] (%8120:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=105)]) -> (%8123:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=105)]) - 
linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=106), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=107), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=108))] (%8121:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=106)]) -> (%8124:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=107)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=103), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=110))] (%8122:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=103)]) -> (%8125:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=107), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=107), )] (%8124:tensor<[1, 16, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=107)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8126:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=107)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109), )] (%8125:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8127:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109)]) - linalg.CPU.CastTypeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109), outputs_0:QuantSpec(Raw(type: Float16), uuid=111), )] (%8127:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109)]) -> (%8128:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=111)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=111), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112), )] (%8128:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=111)]) -> (%8129:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112), )] (%8129:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112)]) -> (%8130:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=105), outputs_0:QuantSpec(Raw(type: Float16), uuid=113), )] (%8123:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=105)]) -> (%8131:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=113)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=113), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=114), )] (%8131:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=113)]) -> (%8132:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=114)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4), )] (%8017:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %8130:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112)]) -> (%8133:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=114), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=32), )] (%8018:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)], %8132:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=114)]) -> (%8134:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4), )] (%8133:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)]) -> (%8135:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32), )] (%8134:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) -> (%8136:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=107), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=115), )] (%8126:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=107)], %8135:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)]) -> (%8137:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=115)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=115), inputs_1:QuantSpec(Raw(type: Float32), uuid=116), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=115), )] (%8137:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=115)], %8138:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=116), constant:[0.088388346]]) -> (%8139:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=115)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=115), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117), )] (%8139:tensor<[1, 16, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=115)]) -> (%8140:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117), inputs_1:QuantSpec(Raw(type: Int16), uuid=118), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117), )] (%8140:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117)], %8141:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=118), constant:[-20]]) -> (%8142:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=119), outputs_0:QuantSpec(Raw(type: UInt8), uuid=120), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8143:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=119), constant:[0]]) -> (%8144:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=120)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=120), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=115), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117), )] (%8144:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=120)], %8139:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=115)], %8142:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117)]) -> (%8145:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=121), )] (%8145:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117)]) -> (%8146:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=121)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=121), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122), )] (%8146:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=121)], %8136:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) -> (%8147:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122), )] (%8147:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122)]) -> (%8148:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122), )] (%8148:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122)]) -> (%8148:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=124), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=123))] (%8148:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122)]) -> (%8149:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)]) - cf.ReturnOp (%8149:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)], %8130:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=112)], %8132:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=114)]) -> () + (%8321:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=101)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8210:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %8211:tensor<[1, 8, 992, 128], 
UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32)]) -> (%8363:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=126)], %8343:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114)], %8345:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=116)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=101, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=104, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=103, solved=0))] (%8321:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=101)]) -> (%8322:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=104)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=101, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=106, 
solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=105, solved=0))] (%8321:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=101)]) -> (%8323:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=106)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=101, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=108, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=107, solved=0))] (%8321:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=101)]) -> (%8324:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=108)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=104, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=104, solved=0), )] (%8322:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=104)]) -> (%8322:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=104)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=104, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=104, solved=0), )] (%8322:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=104)]) -> (%8325:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=104)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=106, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=106, solved=0), )] (%8323:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=106)]) -> (%8323:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=106)]) + linalg.CPU.TransposeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=106, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=106, solved=0), )] (%8323:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=106)]) -> (%8326:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=106)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=108, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=108, solved=0), )] (%8324:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=108)]) -> (%8324:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=108)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=108, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=108, solved=0), )] (%8324:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=108)]) -> (%8327:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=108)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=104, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=110, solved=0))] (%8325:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=104)]) -> (%8328:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=106, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=112, solved=0))] (%8326:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=106)]) -> (%8329:tensor<[1, 8, 32, 
128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), )] (%8328:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)]) -> (%8328:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), )] (%8328:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)]) -> (%8328:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), )] (%8328:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)]) -> (%8330:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=109, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), )] (%8330:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)], %8328:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)]) -> (%8331:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), )] (%8331:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8332:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), )] (%8328:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8333:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), )] (%8333:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)], %8332:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)]) -> (%8334:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), )] (%8329:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) -> (%8329:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), )] (%8329:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) -> (%8329:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=111, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), )] (%8329:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) -> (%8335:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), )] (%8335:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)], %8329:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) -> (%8336:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), )] (%8336:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8337:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), )] (%8329:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8338:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), )] (%8338:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)], %8337:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) -> (%8339:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=113, solved=0), )] (%8339:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) -> (%8340:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=113)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=113, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114, solved=0), )] (%8340:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=113)]) -> (%8341:tensor<[1, 8, 32, 128], 
UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114, solved=0), )] (%8341:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114)]) -> (%8343:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=108, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=115, solved=0), )] (%8327:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=108)]) -> (%8344:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=115)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=115, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=116, solved=0), )] (%8344:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=115)]) -> (%8345:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=116)]) + linalg.CPU.ConcatOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4, solved=0), )] (%8210:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %8343:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114)]) -> (%8347:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=116, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32, solved=0), )] (%8211:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32)], %8345:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=116)]) -> (%8348:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 
-128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4, solved=0), )] (%8347:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)]) -> (%8349:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32, solved=0), )] (%8348:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32)]) -> (%8350:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=117, solved=0), )] (%8334:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)], %8349:tensor<[1, 16, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)]) -> (%8351:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=117)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=117, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=118, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=117, solved=0), )] (%8351:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=117)], %8352:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=118), constant:[0.088388346]]) -> (%8353:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=117)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=117, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=119, solved=0), )] (%8353:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=117)]) -> (%8354:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=119)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=119, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=120, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=119, solved=0), )] (%8354:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=119)], %8355:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=120), constant:[-20]]) -> (%8356:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=119)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=121, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=122, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8357:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=121), constant:[0]]) -> (%8358:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=122)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=122, 
solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=117, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=119, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=119, solved=0), )] (%8358:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=122)], %8353:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=117)], %8356:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=119)]) -> (%8359:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=119)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=119, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=123, solved=0), )] (%8359:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=119)]) -> (%8360:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=123)]) + linalg.CPU.MatMulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=123, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=124, solved=0), )] (%8360:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=123)], %8350:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32)]) -> (%8361:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=124)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=124, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=124, solved=0), )] (%8361:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=124)]) -> (%8362:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=124)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=124, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=124, solved=0), )] (%8362:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=124)]) -> (%8362:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=124)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=124, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=126, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=125, solved=0))] (%8362:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=124)]) -> (%8363:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=126)]) + cf.ReturnOp (%8363:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=126)], %8343:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114)], %8345:tensor<[1, 8, 32, 128], UInt8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=116)]) -> () } } graph.SubGraphOp @model.layers.1.mlp [using_qnn:true, symbol:model.layers.1.mlp] { - (%8151:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=125)]) -> (%8156:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=125), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=128), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=127))] (%8151:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=125)]) -> (%8152:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=128)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=128), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129), )] (%8152:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=128)]) -> (%8153:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=125), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=131), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=130))] (%8151:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=125)]) -> (%8154:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=131)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=131), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129), )] (%8153:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129)], %8154:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=131)]) -> (%8155:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=129), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=132))] (%8155:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129)]) -> (%8156:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)]) - cf.ReturnOp (%8156:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)]) -> () + (%8365:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=127)]) -> (%8371:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=135)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=127, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=130, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=129, solved=0))] (%8365:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=127)]) -> (%8366:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=130)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=127, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=132, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=131, solved=0))] (%8365:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=127)]) -> (%8367:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=132)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=132, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=133, solved=0), )] (%8367:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=132)]) -> (%8368:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=133)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=132, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=133, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=132, solved=0), )] (%8367:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=132)], %8368:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=133)]) -> (%8369:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=132)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=132, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=130, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=132, solved=0), )] (%8369:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=132)], %8366:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=130)]) -> (%8370:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=132)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=132, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=135, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=134, solved=0))] (%8370:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=132)]) -> (%8371:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=135)]) + cf.ReturnOp (%8371:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=135)]) -> () } } graph.SubGraphOp @model.layers.2 [using_qnn:true, symbol:model.layers.2] { - (%8157:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8019:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %8020:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) -> (%8198:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)], %8171:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146)], %8173:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=148)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=134), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=135))] (%8157:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)]) -> (%8158:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=134)]) - graph.CallGraphOp @model.layers.2.self_attn (%8158:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=134)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8019:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %8020:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) -> (%8190:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158)], %8171:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146)], %8173:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=148)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158), )] (%8190:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158)], 
%8157:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=133)]) -> (%8191:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=160))] (%8191:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158)]) -> (%8192:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159)]) - graph.CallGraphOp @model.layers.2.mlp (%8192:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159)]) -> (%8197:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167), )] (%8197:tensor<[1, 
32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)], %8191:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158)]) -> (%8198:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)]) - cf.ReturnOp (%8198:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)], %8171:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146)], %8173:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=148)]) -> () + (%8372:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8212:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %8213:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, 
quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33)]) -> (%8424:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8395:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149)], %8397:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=151)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=136, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=137, solved=0))] (%8372:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8373:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=136)]) + graph.CallGraphOp @model.layers.2.self_attn (%8373:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=136)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 
128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8212:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %8213:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33)]) -> (%8415:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=161)], %8395:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149)], %8397:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=151)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=161, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8372:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8415:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=161)]) -> (%8416:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=162, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=163, solved=0))] (%8416:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8417:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=162)]) + graph.CallGraphOp @model.layers.2.mlp (%8417:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=162)]) -> (%8423:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=170)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=170, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8416:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8423:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=170)]) -> (%8424:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + cf.ReturnOp (%8424:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8395:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149)], %8397:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=151)]) -> () } } graph.SubGraphOp @model.layers.2.self_attn [using_qnn:true, symbol:model.layers.2.self_attn] { - (%8158:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=134)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, 
CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8019:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %8020:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) -> (%8190:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158)], %8171:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146)], %8173:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=148)]) { - linalg.CPU.LinearOp (%8158:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=134)]) -> (%8159:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=140)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=134), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=136))] (%8158:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=134)]) -> (%8160:tensor<[1, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=134), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=138))] (%8158:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=134)]) -> (%8161:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=140), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=140), )] (%8159:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=140)]) -> (%8159:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=140)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=140), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=140), )] (%8159:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=140)]) -> (%8162:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=140)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137), )] (%8160:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137)]) -> (%8160:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137), )] (%8160:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137)]) -> (%8163:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139), )] (%8161:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139)]) -> (%8161:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139), )] (%8161:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139)]) -> (%8164:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=140), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=141), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=142))] (%8162:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=140)]) -> (%8165:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=141)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=143), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=144))] (%8163:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137)]) -> (%8166:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=143)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=141), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=141), )] (%8165:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=141)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8167:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=141)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=143), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=143), )] (%8166:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=143)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8168:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=143)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=143), outputs_0:QuantSpec(Raw(type: Float16), uuid=145), )] (%8168:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=143)]) -> (%8169:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=145)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=145), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146), )] (%8169:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=145)]) -> (%8170:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146), )] (%8170:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146)]) -> (%8171:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=146)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139), outputs_0:QuantSpec(Raw(type: Float16), uuid=147), )] (%8164:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139)]) -> (%8172:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=147)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=147), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=148), )] (%8172:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=147)]) -> (%8173:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=148)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5), )] (%8019:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %8171:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146)]) -> (%8174:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=5)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=148), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33), )] (%8020:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)], %8173:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=148)]) -> (%8175:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5), )] (%8174:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)]) -> (%8176:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33), )] (%8175:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) -> (%8177:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=141), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=149), )] (%8167:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=141)], %8176:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)]) -> (%8178:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=149)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=149), inputs_1:QuantSpec(Raw(type: Float32), uuid=150), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=149), )] (%8178:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=149)], %8179:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=150), constant:[0.088388346]]) -> (%8180:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=149)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=149), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=151), )] (%8180:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=149)]) -> (%8181:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=151)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=151), inputs_1:QuantSpec(Raw(type: Int16), uuid=152), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=151), )] (%8181:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=151)], %8182:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=152), constant:[-20]]) -> (%8183:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=151)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=153), outputs_0:QuantSpec(Raw(type: UInt8), uuid=154), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8184:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=153), constant:[0]]) -> (%8185:tensor<[1, 1, 32, 1024], UInt8, 
CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=154)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=154), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=149), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=151), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=151), )] (%8185:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=154)], %8180:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=149)], %8183:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=151)]) -> (%8186:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=151)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=151), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155), )] (%8186:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=151)]) -> (%8187:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=155), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=156), )] (%8187:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155)], %8177:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) -> (%8188:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=156)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=156), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=156), )] (%8188:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=156)]) -> (%8189:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=156)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=156), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=156), )] (%8189:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=156)]) -> 
(%8189:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=156)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=156), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=157))] (%8189:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=156)]) -> (%8190:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158)]) - cf.ReturnOp (%8190:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=158)], %8171:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=146)], %8173:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=148)]) -> () + (%8373:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=136)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8212:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %8213:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33)]) -> (%8415:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=161)], %8395:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149)], %8397:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=151)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=136, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=139, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=138, solved=0))] (%8373:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=136)]) -> (%8374:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=139)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=136, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=141, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=140, solved=0))] (%8373:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=136)]) -> (%8375:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=141)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=136, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=143, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=142, solved=0))] (%8373:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=136)]) -> (%8376:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=143)]) + 
linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=139, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=139, solved=0), )] (%8374:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=139)]) -> (%8374:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=139)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=139, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=139, solved=0), )] (%8374:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=139)]) -> (%8377:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=139)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=141, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=141, solved=0), )] (%8375:tensor<[1, 32, 1024], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=141)]) -> (%8375:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=141)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=141, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=141, solved=0), )] (%8375:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=141)]) -> (%8378:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=141)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=143, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=143, solved=0), )] (%8376:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=143)]) -> (%8376:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=143)]) + linalg.CPU.TransposeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=143, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=143, solved=0), )] (%8376:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=143)]) -> (%8379:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=143)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=139, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=145, solved=0))] (%8377:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=139)]) -> (%8380:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=141, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=146, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=147, solved=0))] (%8378:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=141)]) -> (%8381:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), )] (%8380:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)]) -> (%8380:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), )] (%8380:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)]) -> (%8380:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), )] (%8380:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)]) -> (%8382:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), )] (%8382:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)], %8380:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)]) -> (%8383:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), )] (%8383:tensor<[1, 16, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8384:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), )] (%8380:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8385:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), )] (%8385:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)], %8384:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)]) -> (%8386:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), )] (%8381:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)]) -> (%8381:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), )] (%8381:tensor<[1, 8, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)]) -> (%8381:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), )] (%8381:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)]) -> (%8387:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), )] (%8387:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)], %8381:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=146)]) -> (%8388:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), )] (%8388:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8389:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), )] (%8381:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=146)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8390:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), )] (%8390:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)], %8389:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)]) -> (%8391:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=148, solved=0), )] (%8391:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=146)]) -> (%8392:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=148)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=148, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149, solved=0), )] (%8392:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=148)]) -> (%8393:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149, solved=0), )] (%8393:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149)]) -> (%8395:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=143, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=150, solved=0), )] (%8379:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=143)]) -> (%8396:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=150)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), 
uuid=150, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=151, solved=0), )] (%8396:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=150)]) -> (%8397:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=151)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5, solved=0), )] (%8212:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %8395:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149)]) -> (%8399:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=151, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33, solved=0), )] (%8213:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33)], 
%8397:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=151)]) -> (%8400:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5, solved=0), )] (%8399:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)]) -> (%8401:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33, solved=0), )] (%8400:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33)]) -> (%8402:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=152, solved=0), )] (%8386:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)], %8401:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)]) -> (%8403:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=152)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=152, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=153, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=152, solved=0), )] (%8403:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=152)], %8404:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=153), constant:[0.088388346]]) -> (%8405:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=152)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=152, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=154, solved=0), )] (%8405:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=152)]) -> (%8406:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=154)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=154, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=155, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=154, solved=0), )] (%8406:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=154)], %8407:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=155), constant:[-20]]) -> (%8408:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=154)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=156, solved=0), outputs_0:QuantSpec(Raw(type: 
UInt8), uuid=157, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8409:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=156), constant:[0]]) -> (%8410:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=157)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=157, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=152, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=154, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=154, solved=0), )] (%8410:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=157)], %8405:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=152)], %8408:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=154)]) -> (%8411:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=154)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=154, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=158, solved=0), )] 
(%8411:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=154)]) -> (%8412:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=158)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=158, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=159, solved=0), )] (%8412:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=158)], %8402:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33)]) -> (%8413:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=159)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=159, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=159, solved=0), )] (%8413:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=159)]) -> (%8414:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=159)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=159, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=159, solved=0), )] (%8414:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=159)]) -> (%8414:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=159)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=159, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=161, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=160, solved=0))] (%8414:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=159)]) -> (%8415:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=161)]) + 
cf.ReturnOp (%8415:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=161)], %8395:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149)], %8397:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=151)]) -> () } } graph.SubGraphOp @model.layers.2.mlp [using_qnn:true, symbol:model.layers.2.mlp] { - (%8192:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159)]) -> (%8197:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=162), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=161))] (%8192:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159)]) -> (%8193:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=162)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=162), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=163), )] (%8193:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=162)]) -> (%8194:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=163)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=165), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=164))] (%8192:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159)]) -> (%8195:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=165)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=163), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=165), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=163), )] (%8194:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=163)], %8195:tensor<[1, 32, 6144], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=165)]) -> (%8196:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=163)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=163), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=166))] (%8196:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=163)]) -> (%8197:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)]) - cf.ReturnOp (%8197:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)]) -> () + (%8417:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=162)]) -> (%8423:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=170)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=162, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=165, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=164, solved=0))] (%8417:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=162)]) -> (%8418:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=165)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=162, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=166, solved=0))] (%8417:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=162)]) -> (%8419:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=168, solved=0), )] (%8419:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167)]) -> (%8420:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=168)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=168, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167, solved=0), )] (%8419:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167)], %8420:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=168)]) -> (%8421:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=165, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167, solved=0), )] (%8421:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167)], %8418:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=165)]) -> (%8422:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=170, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=169, solved=0))] (%8422:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167)]) -> (%8423:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=170)]) + cf.ReturnOp (%8423:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=170)]) -> () } } graph.SubGraphOp @model.layers.3 [using_qnn:true, 
symbol:model.layers.3] { - (%8198:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8021:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %8022:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) -> (%8239:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)], %8212:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180)], %8214:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=182)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=168), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=169))] (%8198:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)]) -> (%8199:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=168)]) - graph.CallGraphOp @model.layers.3.self_attn (%8199:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=168)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8021:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %8022:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) -> (%8231:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192)], %8212:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180)], %8214:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=182)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=192), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192), )] (%8231:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192)], %8198:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)]) -> (%8232:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=193), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=194))] (%8232:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192)]) -> (%8233:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=193)]) - graph.CallGraphOp @model.layers.3.mlp (%8233:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=193)]) -> (%8238:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=201)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201), )] (%8238:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)], %8232:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192)]) -> (%8239:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)]) - cf.ReturnOp (%8239:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)], %8212:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180)], %8214:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=182)]) -> () + (%8424:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8214:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %8215:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34)]) -> (%8476:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8447:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184)], %8449:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=186)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=171, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=172, solved=0))] (%8424:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8425:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=171)]) + graph.CallGraphOp @model.layers.3.self_attn (%8425:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=171)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8214:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %8215:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34)]) -> (%8467:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=196)], %8447:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184)], %8449:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=186)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=196, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8424:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8467:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=196)]) -> (%8468:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=197, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=198, solved=0))] (%8468:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8469:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=197)]) + graph.CallGraphOp @model.layers.3.mlp (%8469:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=197)]) -> (%8475:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=205)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=205, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8468:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8475:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=205)]) -> (%8476:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + cf.ReturnOp (%8476:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8447:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184)], %8449:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=186)]) -> () } } graph.SubGraphOp @model.layers.3.self_attn [using_qnn:true, symbol:model.layers.3.self_attn] { - (%8199:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=168)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8021:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %8022:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) -> (%8231:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192)], %8212:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180)], %8214:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=182)]) { - linalg.CPU.LinearOp (%8199:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=168)]) -> (%8200:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=174)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=168), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=171), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=170))] (%8199:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=168)]) -> (%8201:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=171)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=168), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=172))] (%8199:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=168)]) -> (%8202:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=174), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=174), )] (%8200:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=174)]) -> (%8200:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=174)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=174), outputs_0:QuantSpec(Raw(type: 
Int16PerTensor), uuid=174), )] (%8200:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=174)]) -> (%8203:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=174)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=171), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=171), )] (%8201:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=171)]) -> (%8201:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=171)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=171), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=171), )] (%8201:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=171)]) -> (%8204:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=171)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173), )] (%8202:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173)]) -> (%8202:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173), )] (%8202:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173)]) -> (%8205:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=174), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=175), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=176))] (%8203:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=174)]) -> (%8206:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=175)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=171), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=177), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=178))] (%8204:tensor<[1, 8, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=171)]) -> (%8207:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=177)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=175), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=175), )] (%8206:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=175)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8208:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=175)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=177), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=177), )] (%8207:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=177)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8209:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=177)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=177), outputs_0:QuantSpec(Raw(type: Float16), uuid=179), )] (%8209:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=177)]) -> (%8210:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=179)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=179), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180), )] (%8210:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=179)]) -> (%8211:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 
-128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180), )] (%8211:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180)]) -> (%8212:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173), outputs_0:QuantSpec(Raw(type: Float16), uuid=181), )] (%8205:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173)]) -> (%8213:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=181)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=181), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=182), )] (%8213:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=181)]) -> (%8214:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=182)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6), )] (%8021:tensor<[1, 
8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %8212:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180)]) -> (%8215:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=182), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34), )] (%8022:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)], %8214:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=182)]) -> (%8216:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6), )] (%8215:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)]) -> (%8217:tensor<[1, 16, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34), )] (%8216:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) -> (%8218:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=175), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=183), )] (%8208:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=175)], %8217:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)]) -> (%8219:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=183)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=183), inputs_1:QuantSpec(Raw(type: Float32), uuid=184), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=183), )] (%8219:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=183)], %8220:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=184), constant:[0.088388346]]) -> (%8221:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=183)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=183), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185), )] (%8221:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=183)]) -> (%8222:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185), inputs_1:QuantSpec(Raw(type: Int16), uuid=186), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185), )] (%8222:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185)], %8223:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=186), constant:[-20]]) -> (%8224:tensor<[1, 16, 32, 1], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=187), outputs_0:QuantSpec(Raw(type: UInt8), uuid=188), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8225:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=187), constant:[0]]) -> (%8226:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=188)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=188), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=183), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185), )] (%8226:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=188)], %8221:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=183)], %8224:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185)]) -> (%8227:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=189), )] (%8227:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185)]) -> (%8228:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=190), )] (%8228:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189)], %8218:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) -> (%8229:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=190)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=190), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=190), )] (%8229:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=190)]) -> (%8230:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=190)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=190), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=190), )] (%8230:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=190)]) -> (%8230:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=190)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=190), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=191))] (%8230:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=190)]) -> (%8231:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192)]) - cf.ReturnOp (%8231:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192)], %8212:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=180)], %8214:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=182)]) -> () + (%8425:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=171)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8214:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %8215:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34)]) -> (%8467:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=196)], %8447:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184)], %8449:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=186)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=171, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=174, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=173, solved=0))] (%8425:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=171)]) -> (%8426:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=174)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=171, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=176, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=175, solved=0))] (%8425:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=171)]) -> (%8427:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=176)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=171, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=178, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, 
quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=177, solved=0))] (%8425:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=171)]) -> (%8428:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=178)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=174, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=174, solved=0), )] (%8426:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=174)]) -> (%8426:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=174)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=174, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=174, solved=0), )] (%8426:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=174)]) -> (%8429:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=174)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=176, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=176, solved=0), )] (%8427:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=176)]) -> (%8427:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=176)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=176, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=176, solved=0), )] (%8427:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=176)]) -> (%8430:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=176)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=178, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=178, 
solved=0), )] (%8428:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=178)]) -> (%8428:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=178)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=178, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=178, solved=0), )] (%8428:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=178)]) -> (%8431:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=178)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=174, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=180, solved=0))] (%8429:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=174)]) -> (%8432:tensor<[1, 16, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=176, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=182, solved=0))] (%8430:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=176)]) -> (%8433:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), )] (%8432:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)]) -> (%8432:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), )] (%8432:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)]) -> (%8432:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)]) + 
linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), )] (%8432:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)]) -> (%8434:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), )] (%8434:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)], %8432:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)]) -> (%8435:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 
0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), )] (%8435:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8436:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), )] (%8432:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8437:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), )] (%8437:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)], %8436:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)]) -> (%8438:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), )] (%8433:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) -> (%8433:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) + linalg.CPU.SliceOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), )] (%8433:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) -> (%8433:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), )] (%8433:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) -> (%8439:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=181, solved=0), )] (%8439:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)], %8433:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) -> (%8440:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), )] (%8440:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8441:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), )] (%8433:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8442:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), )] (%8442:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)], %8441:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) -> (%8443:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) + linalg.CPU.CastTypeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=183, solved=0), )] (%8443:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) -> (%8444:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=183)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=183, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184, solved=0), )] (%8444:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=183)]) -> (%8445:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184, solved=0), )] (%8445:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184)]) -> (%8447:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=178, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=185, 
solved=0), )] (%8431:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=178)]) -> (%8448:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=185)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=185, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=186, solved=0), )] (%8448:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=185)]) -> (%8449:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=186)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6, solved=0), )] (%8214:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %8447:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184)]) -> (%8451:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34, solved=0), 
inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=186, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34, solved=0), )] (%8215:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34)], %8449:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=186)]) -> (%8452:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6, solved=0), )] (%8451:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)]) -> (%8453:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34, solved=0), )] (%8452:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34)]) -> (%8454:tensor<[1, 16, 1024, 128], UInt8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=187, solved=0), )] (%8438:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)], %8453:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)]) -> (%8455:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=187)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=187, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=188, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=187, solved=0), )] (%8455:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=187)], %8456:tensor<[1], Float32, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=188), constant:[0.088388346]]) -> (%8457:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=187)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=187, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=189, solved=0), )] (%8457:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=187)]) -> (%8458:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=189)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=189, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=190, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=189, solved=0), )] (%8458:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=189)], %8459:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=190), constant:[-20]]) -> (%8460:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=189)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=191, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=192, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8461:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=191), constant:[0]]) -> (%8462:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=192)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=192, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=187, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=189, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=189, solved=0), )] (%8462:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=192)], %8457:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=187)], %8460:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=189)]) -> (%8463:tensor<[1, 16, 32, 1024], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=189)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=189, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=193, solved=0), )] (%8463:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=189)]) -> (%8464:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=193)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=193, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=194, solved=0), )] (%8464:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=193)], %8454:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34)]) -> (%8465:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=194)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=194, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=194, solved=0), )] (%8465:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=194)]) -> (%8466:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=194)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=194, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=194, solved=0), )] (%8466:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=194)]) -> (%8466:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=194)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=194, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=196, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, 
block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=195, solved=0))] (%8466:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=194)]) -> (%8467:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=196)]) + cf.ReturnOp (%8467:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=196)], %8447:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184)], %8449:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=186)]) -> () } } graph.SubGraphOp @model.layers.3.mlp [using_qnn:true, symbol:model.layers.3.mlp] { - (%8233:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=193)]) -> (%8238:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=193), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=196), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: 
Float32), uuid=195))] (%8233:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=193)]) -> (%8234:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=196)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=196), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=197), )] (%8234:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=196)]) -> (%8235:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=197)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=193), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=199), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=198))] (%8233:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=193)]) -> (%8236:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=199)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=197), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=199), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=197), )] (%8235:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=197)], %8236:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=199)]) -> (%8237:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=197)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=197), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=200))] (%8237:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=197)]) -> (%8238:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)]) - cf.ReturnOp (%8238:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)]) -> () + (%8469:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=197)]) -> (%8475:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=205)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=197, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=200, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=199, solved=0))] (%8469:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=197)]) -> (%8470:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=200)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=197, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=201, solved=0))] (%8469:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=197)]) -> 
(%8471:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=203, solved=0), )] (%8471:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202)]) -> (%8472:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=203)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=203, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202, solved=0), )] (%8471:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202)], %8472:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=203)]) -> (%8473:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=200, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202, solved=0), )] (%8473:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202)], %8470:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=200)]) -> (%8474:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=205, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=204, solved=0))] (%8474:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202)]) -> (%8475:tensor<[1, 32, 2048], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=205)]) + cf.ReturnOp (%8475:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=205)]) -> () } } graph.SubGraphOp @model.layers.4 [using_qnn:true, symbol:model.layers.4] { - (%8239:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8023:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %8024:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) -> (%8280:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)], %8253:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214)], %8255:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=216)]) { - 
linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=202), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=203))] (%8239:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)]) -> (%8240:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=202)]) - graph.CallGraphOp @model.layers.4.self_attn (%8240:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=202)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8023:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %8024:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) -> (%8272:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226)], 
%8253:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214)], %8255:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=216)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226), )] (%8272:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226)], %8239:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=201)]) -> (%8273:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=227), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=228))] (%8273:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226)]) -> (%8274:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=227)]) - graph.CallGraphOp @model.layers.4.mlp (%8274:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=227)]) -> (%8279:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235), )] (%8279:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)], %8273:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226)]) -> (%8280:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)]) - cf.ReturnOp (%8280:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)], %8253:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214)], %8255:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: 
Int8, scale_type: Float32), uuid=216)]) -> () + (%8476:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8216:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %8217:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35)]) -> (%8528:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8499:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219)], %8501:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=221)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=206, solved=0), 
weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=207, solved=0))] (%8476:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8477:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=206)]) + graph.CallGraphOp @model.layers.4.self_attn (%8477:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=206)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8216:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %8217:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35)]) -> (%8519:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=231)], %8499:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219)], %8501:tensor<[1, 
8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=221)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=231, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8476:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8519:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=231)]) -> (%8520:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=232, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=233, solved=0))] (%8520:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) 
-> (%8521:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=232)]) + graph.CallGraphOp @model.layers.4.mlp (%8521:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=232)]) -> (%8527:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=240)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=240, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8520:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8527:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=240)]) -> (%8528:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + cf.ReturnOp (%8528:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8499:tensor<[1, 
8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219)], %8501:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=221)]) -> () } } graph.SubGraphOp @model.layers.4.self_attn [using_qnn:true, symbol:model.layers.4.self_attn] { - (%8240:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=202)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8023:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %8024:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) -> (%8272:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226)], %8253:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214)], %8255:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=216)]) { - linalg.CPU.LinearOp (%8240:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=202)]) -> (%8241:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=208)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=202), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=205), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=204))] (%8240:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=202)]) -> (%8242:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=205)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=202), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=206))] (%8240:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=202)]) -> (%8243:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207)]) - linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=208), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=208), )] (%8241:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=208)]) -> (%8241:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=208)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=208), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=208), )] (%8241:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=208)]) -> (%8244:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=208)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=205), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=205), )] (%8242:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=205)]) -> (%8242:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=205)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=205), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=205), )] (%8242:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=205)]) -> (%8245:tensor<[1, 8, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=205)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207), )] (%8243:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207)]) -> (%8243:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207), )] (%8243:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207)]) -> (%8246:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=208), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=209), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=210))] (%8244:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=208)]) -> (%8247:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=209)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=205), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=211), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=212))] (%8245:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=205)]) -> (%8248:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=211)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=209), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=209), )] (%8247:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=209)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8249:tensor<[1, 16, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=209)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=211), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=211), )] (%8248:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=211)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8250:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=211)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=211), outputs_0:QuantSpec(Raw(type: Float16), uuid=213), )] (%8250:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=211)]) -> (%8251:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=213)]) - linalg.CPU.CastTypeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=213), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214), )] (%8251:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=213)]) -> (%8252:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214), )] (%8252:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214)]) -> (%8253:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207), outputs_0:QuantSpec(Raw(type: Float16), uuid=215), )] (%8246:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207)]) -> (%8254:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=215)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=215), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=216), )] (%8254:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=215)]) -> (%8255:tensor<[1, 8, 
32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=216)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7), )] (%8023:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %8253:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214)]) -> (%8256:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=216), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35), )] (%8024:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)], %8255:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=216)]) -> (%8257:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=35)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7), )] (%8256:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)]) -> (%8258:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35), )] (%8257:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) -> (%8259:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=209), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217), )] (%8249:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=209)], %8258:tensor<[1, 16, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)]) -> (%8260:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217), inputs_1:QuantSpec(Raw(type: Float32), uuid=218), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217), )] (%8260:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217)], %8261:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=218), constant:[0.088388346]]) -> (%8262:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219), )] (%8262:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217)]) -> (%8263:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219), 
inputs_1:QuantSpec(Raw(type: Int16), uuid=220), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219), )] (%8263:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219)], %8264:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=220), constant:[-20]]) -> (%8265:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=221), outputs_0:QuantSpec(Raw(type: UInt8), uuid=222), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8266:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=221), constant:[0]]) -> (%8267:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=222)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=222), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219), )] (%8267:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=222)], %8262:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217)], %8265:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219)]) -> (%8268:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=223), )] (%8268:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219)]) -> (%8269:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=223)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=223), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=224), )] (%8269:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=223)], %8259:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) -> (%8270:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=224)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=224), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=224), )] (%8270:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=224)]) -> (%8271:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=224)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=224), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=224), )] (%8271:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=224)]) -> (%8271:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=224)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=224), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=225))] (%8271:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=224)]) -> (%8272:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226)]) - cf.ReturnOp (%8272:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=226)], %8253:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=214)], %8255:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=216)]) -> () + (%8477:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=206)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8216:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %8217:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35)]) -> (%8519:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=231)], %8499:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), 
uuid=219)], %8501:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=221)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=206, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=209, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=208, solved=0))] (%8477:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=206)]) -> (%8478:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=209)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=206, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=211, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=210, solved=0))] (%8477:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=206)]) -> (%8479:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=211)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=206, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=213, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=212, solved=0))] (%8477:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=206)]) -> (%8480:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=213)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=209, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=209, solved=0), )] (%8478:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=209)]) -> (%8478:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=209)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=209, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=209, solved=0), )] (%8478:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=209)]) -> (%8481:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=209)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=211, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=211, solved=0), )] (%8479:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=211)]) -> (%8479:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=211)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=211, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=211, solved=0), )] (%8479:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=211)]) -> (%8482:tensor<[1, 8, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=211)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=213, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=213, solved=0), )] (%8480:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=213)]) -> (%8480:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=213)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=213, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=213, solved=0), )] (%8480:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=213)]) -> (%8483:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=213)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=209, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=215, solved=0))] (%8481:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=209)]) -> (%8484:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=211, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=217, solved=0))] (%8482:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=211)]) -> (%8485:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), )] (%8484:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)]) -> (%8484:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), )] (%8484:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)]) -> (%8484:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), )] (%8484:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)]) -> (%8486:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), )] (%8486:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)], %8484:tensor<[1, 16, 
32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)]) -> (%8487:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), )] (%8487:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8488:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), )] (%8484:tensor<[1, 16, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8489:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), )] (%8489:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)], %8488:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)]) -> (%8490:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=216, solved=0), )] (%8485:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) -> (%8485:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), )] (%8485:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) -> (%8485:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), )] (%8485:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) -> (%8491:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=216)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), )] (%8491:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)], %8485:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) -> (%8492:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), )] (%8492:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32), uuid=63)]) -> (%8493:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), )] (%8485:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8494:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), )] (%8494:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=216)], %8493:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) -> (%8495:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=218, solved=0), )] (%8495:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) -> (%8496:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=218)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=218, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219, solved=0), )] (%8496:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=218)]) -> (%8497:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219, solved=0), )] (%8497:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, 
scale_type: Float32), uuid=219)]) -> (%8499:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=213, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=220, solved=0), )] (%8483:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=213)]) -> (%8500:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=220)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=220, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=221, solved=0), )] (%8500:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=220)]) -> (%8501:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=221)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7, solved=0), )] (%8216:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %8499:tensor<[1, 8, 128, 32], UInt8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219)]) -> (%8503:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=221, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35, solved=0), )] (%8217:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35)], %8501:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=221)]) -> (%8504:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7, solved=0), )] (%8503:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)]) -> (%8505:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)]) + linalg.CPU.RepeatOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35, solved=0), )] (%8504:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35)]) -> (%8506:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=222, solved=0), )] (%8490:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)], %8505:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)]) -> (%8507:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=222)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=222, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=223, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=222, solved=0), )] (%8507:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=222)], %8508:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=223), constant:[0.088388346]]) -> (%8509:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=222)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=222, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=224, solved=0), )] (%8509:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=222)]) -> (%8510:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=224)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=224, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=225, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=224, solved=0), )] (%8510:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=224)], %8511:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=225), constant:[-20]]) -> (%8512:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=224)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=226, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=227, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8513:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=226), constant:[0]]) -> (%8514:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=227)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=227, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=222, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=224, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=224, solved=0), )] (%8514:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=227)], %8509:tensor<[1, 16, 32, 
1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=222)], %8512:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=224)]) -> (%8515:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=224)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=224, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=228, solved=0), )] (%8515:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=224)]) -> (%8516:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=228)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=228, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=229, solved=0), )] (%8516:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=228)], %8506:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35)]) -> (%8517:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=229)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=229, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=229, solved=0), )] (%8517:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=229)]) -> (%8518:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=229)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=229, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=229, solved=0), )] (%8518:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=229)]) -> (%8518:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=229)]) 
+ linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=229, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=231, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=230, solved=0))] (%8518:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=229)]) -> (%8519:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=231)]) + cf.ReturnOp (%8519:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=231)], %8499:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219)], %8501:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=221)]) -> () } } graph.SubGraphOp @model.layers.4.mlp [using_qnn:true, symbol:model.layers.4.mlp] { - (%8274:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=227)]) -> (%8279:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)]) { - linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=227), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=230), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=229))] (%8274:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=227)]) -> (%8275:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=230)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=230), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=231), )] (%8275:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=230)]) -> (%8276:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=231)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=227), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=233), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=232))] (%8274:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=227)]) -> (%8277:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=233)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=231), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=233), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=231), )] (%8276:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=231)], %8277:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=233)]) -> (%8278:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=231)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=231), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=234))] (%8278:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=231)]) -> (%8279:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)]) - cf.ReturnOp (%8279:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)]) -> () + (%8521:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=232)]) -> (%8527:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=240)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=232, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=235, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=234, solved=0))] (%8521:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=232)]) -> (%8522:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=235)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=232, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=237, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=236, solved=0))] (%8521:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=232)]) -> (%8523:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=237)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=237, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=238, solved=0), )] (%8523:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=237)]) -> (%8524:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=238)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=237, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=238, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=237, solved=0), )] (%8523:tensor<[1, 32, 6144], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=237)], %8524:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=238)]) -> (%8525:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=237)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=237, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=235, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=237, solved=0), )] (%8525:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=237)], %8522:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=235)]) -> (%8526:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=237)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=237, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=240, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=239, solved=0))] (%8526:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=237)]) -> (%8527:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=240)]) + cf.ReturnOp (%8527:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=240)]) -> () } } graph.SubGraphOp @model.layers.5 [using_qnn:true, symbol:model.layers.5] { - (%8280:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8025:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %8026:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) -> (%8321:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)], %8294:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248)], %8296:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=250)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=237))] (%8280:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)]) -> (%8281:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236)]) - graph.CallGraphOp @model.layers.5.self_attn (%8281:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8025:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %8026:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) -> (%8313:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260)], %8294:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248)], %8296:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=250)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260), )] (%8313:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260)], %8280:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=235)]) -> (%8314:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=261), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=262))] (%8314:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260)]) -> (%8315:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=261)]) - graph.CallGraphOp @model.layers.5.mlp (%8315:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=261)]) -> (%8320:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269), )] (%8320:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)], %8314:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260)]) -> (%8321:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)]) - cf.ReturnOp 
(%8321:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)], %8294:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248)], %8296:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=250)]) -> () + (%8528:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8218:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %8219:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36)]) -> (%8580:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8551:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254)], %8553:tensor<[1, 8, 32, 128], UInt8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=256)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=241, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=242, solved=0))] (%8528:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8529:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=241)]) + graph.CallGraphOp @model.layers.5.self_attn (%8529:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=241)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8218:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %8219:tensor<[1, 8, 992, 128], UInt8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36)]) -> (%8571:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=266)], %8551:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254)], %8553:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=256)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=266, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8528:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8571:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=266)]) -> (%8572:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=267, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=268, solved=0))] (%8572:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8573:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=267)]) + graph.CallGraphOp @model.layers.5.mlp (%8573:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=267)]) -> (%8579:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=275)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=275, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8572:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8579:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=275)]) -> (%8580:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + cf.ReturnOp (%8580:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8551:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254)], %8553:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=256)]) -> () } } graph.SubGraphOp @model.layers.5.self_attn [using_qnn:true, symbol:model.layers.5.self_attn] { - (%8281:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8025:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %8026:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) -> (%8313:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260)], %8294:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248)], %8296:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=250)]) { - linalg.CPU.LinearOp (%8281:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236)]) -> (%8282:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=242)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=238))] (%8281:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236)]) -> (%8283:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=241), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, 
block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=240))] (%8281:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236)]) -> (%8284:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=241)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=242), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=242), )] (%8282:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=242)]) -> (%8282:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=242)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=242), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=242), )] (%8282:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=242)]) -> (%8285:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=242)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239), )] (%8283:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239)]) -> (%8283:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239)]) - linalg.CPU.TransposeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239), )] (%8283:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239)]) -> (%8286:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=241), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=241), )] (%8284:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=241)]) -> (%8284:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=241)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=241), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=241), )] (%8284:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=241)]) -> (%8287:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=241)]) - 
linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=242), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=243), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=244))] (%8285:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=242)]) -> (%8288:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=243)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=245), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=246))] (%8286:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239)]) -> (%8289:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=245)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=243), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=243), )] (%8288:tensor<[1, 16, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=243)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8290:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=243)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=245), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=245), )] (%8289:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=245)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8291:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=245)]) - linalg.CPU.CastTypeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=245), outputs_0:QuantSpec(Raw(type: Float16), uuid=247), )] (%8291:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=245)]) -> (%8292:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=247)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=247), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248), )] (%8292:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=247)]) -> (%8293:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248), )] (%8293:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248)]) -> (%8294:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=241), outputs_0:QuantSpec(Raw(type: Float16), uuid=249), )] (%8287:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=241)]) -> (%8295:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=249)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=249), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=250), )] (%8295:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=249)]) -> (%8296:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=250)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8), )] (%8025:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %8294:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248)]) -> (%8297:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=250), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=36), )] (%8026:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)], %8296:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=250)]) -> (%8298:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8), )] (%8297:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)]) -> (%8299:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36), )] (%8298:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) -> (%8300:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=243), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=251), )] (%8290:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=243)], %8299:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)]) -> (%8301:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=251)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=251), inputs_1:QuantSpec(Raw(type: Float32), uuid=252), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=251), )] (%8301:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=251)], %8302:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=252), constant:[0.088388346]]) -> (%8303:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=251)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=251), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=253), )] (%8303:tensor<[1, 16, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=251)]) -> (%8304:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=253)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=253), inputs_1:QuantSpec(Raw(type: Int16), uuid=254), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=253), )] (%8304:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=253)], %8305:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=254), constant:[-20]]) -> (%8306:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=253)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=255), outputs_0:QuantSpec(Raw(type: UInt8), uuid=256), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8307:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=255), constant:[0]]) -> (%8308:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=256)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=256), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=251), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=253), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=253), )] (%8308:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=256)], %8303:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=251)], %8306:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=253)]) -> (%8309:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=253)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=253), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257), )] (%8309:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=253)]) -> (%8310:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=258), )] (%8310:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=257)], %8300:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) -> (%8311:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=258)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=258), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=258), )] (%8311:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=258)]) -> (%8312:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=258)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=258), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=258), )] (%8312:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=258)]) -> (%8312:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=258)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=258), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=260), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=259))] (%8312:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=258)]) -> (%8313:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260)]) - cf.ReturnOp (%8313:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=260)], %8294:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=248)], %8296:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=250)]) -> () + (%8529:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=241)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8218:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %8219:tensor<[1, 8, 992, 128], 
UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36)]) -> (%8571:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=266)], %8551:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254)], %8553:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=256)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=241, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=244, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=243, solved=0))] (%8529:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=241)]) -> (%8530:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=244)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=241, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=246, 
solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=245, solved=0))] (%8529:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=241)]) -> (%8531:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=246)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=241, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=248, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=247, solved=0))] (%8529:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=241)]) -> (%8532:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=248)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=244, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=244, solved=0), )] (%8530:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=244)]) -> (%8530:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=244)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=244, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=244, solved=0), )] (%8530:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=244)]) -> (%8533:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=244)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=246, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=246, solved=0), )] (%8531:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=246)]) -> (%8531:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=246)]) + linalg.CPU.TransposeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=246, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=246, solved=0), )] (%8531:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=246)]) -> (%8534:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=246)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=248, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=248, solved=0), )] (%8532:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=248)]) -> (%8532:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=248)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=248, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=248, solved=0), )] (%8532:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=248)]) -> (%8535:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=248)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=244, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=250, solved=0))] (%8533:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=244)]) -> (%8536:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=246, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=252, solved=0))] (%8534:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=246)]) -> (%8537:tensor<[1, 8, 32, 
128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), )] (%8536:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)]) -> (%8536:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), )] (%8536:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)]) -> (%8536:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), )] (%8536:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)]) -> (%8538:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=249, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), )] (%8538:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)], %8536:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)]) -> (%8539:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), )] (%8539:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8540:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), )] (%8536:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8541:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), )] (%8541:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)], %8540:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)]) -> (%8542:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), )] (%8537:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) -> (%8537:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), )] (%8537:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) -> (%8537:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=251, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), )] (%8537:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) -> (%8543:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), )] (%8543:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)], %8537:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) -> (%8544:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), )] (%8544:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8545:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), )] (%8537:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8546:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), )] (%8546:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)], %8545:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) -> (%8547:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=253, solved=0), )] (%8547:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) -> (%8548:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=253)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=253, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254, solved=0), )] (%8548:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=253)]) -> (%8549:tensor<[1, 8, 32, 128], 
UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254, solved=0), )] (%8549:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254)]) -> (%8551:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=248, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=255, solved=0), )] (%8535:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=248)]) -> (%8552:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=255)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=255, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=256, solved=0), )] (%8552:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=255)]) -> (%8553:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=256)]) + linalg.CPU.ConcatOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8, solved=0), )] (%8218:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %8551:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254)]) -> (%8555:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=256, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36, solved=0), )] (%8219:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36)], %8553:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=256)]) -> (%8556:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 
-128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8, solved=0), )] (%8555:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)]) -> (%8557:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36, solved=0), )] (%8556:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36)]) -> (%8558:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=257, solved=0), )] (%8542:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)], %8557:tensor<[1, 16, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)]) -> (%8559:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=257)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=257, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=258, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=257, solved=0), )] (%8559:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=257)], %8560:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=258), constant:[0.088388346]]) -> (%8561:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=257)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=257, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=259, solved=0), )] (%8561:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=257)]) -> (%8562:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=259)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=259, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=260, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=259, solved=0), )] (%8562:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=259)], %8563:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=260), constant:[-20]]) -> (%8564:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=259)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=261, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=262, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8565:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=261), constant:[0]]) -> (%8566:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=262)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=262, 
solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=257, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=259, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=259, solved=0), )] (%8566:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=262)], %8561:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=257)], %8564:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=259)]) -> (%8567:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=259)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=259, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=263, solved=0), )] (%8567:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=259)]) -> (%8568:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=263)]) + linalg.CPU.MatMulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=263, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=264, solved=0), )] (%8568:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=263)], %8558:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36)]) -> (%8569:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=264)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=264, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=264, solved=0), )] (%8569:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=264)]) -> (%8570:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=264)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=264, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=264, solved=0), )] (%8570:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=264)]) -> (%8570:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=264)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=264, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=266, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=265, solved=0))] (%8570:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=264)]) -> (%8571:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=266)]) + cf.ReturnOp (%8571:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=266)], %8551:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254)], %8553:tensor<[1, 8, 32, 128], UInt8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=256)]) -> () } } graph.SubGraphOp @model.layers.5.mlp [using_qnn:true, symbol:model.layers.5.mlp] { - (%8315:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=261)]) -> (%8320:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=261), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=263))] (%8315:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=261)]) -> (%8316:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=265), )] (%8316:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264)]) -> (%8317:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=265)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=261), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=267), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=266))] (%8315:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=261)]) -> (%8318:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=267)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=265), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=267), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=265), )] (%8317:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=265)], %8318:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=267)]) -> (%8319:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=265)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=265), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=268))] (%8319:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=265)]) -> (%8320:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)]) - cf.ReturnOp (%8320:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)]) -> () + (%8573:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=267)]) -> (%8579:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=275)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=267, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=270, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=269, solved=0))] (%8573:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=267)]) -> (%8574:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=270)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=267, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=271, solved=0))] (%8573:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=267)]) -> (%8575:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=273, solved=0), )] (%8575:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272)]) -> (%8576:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=273)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=273, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272, solved=0), )] (%8575:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272)], %8576:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=273)]) -> (%8577:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=270, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272, solved=0), )] (%8577:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272)], %8574:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=270)]) -> (%8578:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=275, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=274, solved=0))] (%8578:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272)]) -> (%8579:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=275)]) + cf.ReturnOp (%8579:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=275)]) -> () } } graph.SubGraphOp @model.layers.6 [using_qnn:true, symbol:model.layers.6] { - (%8321:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8027:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %8028:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) -> (%8362:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)], %8335:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282)], %8337:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=270), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=271))] (%8321:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)]) -> (%8322:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=270)]) - graph.CallGraphOp @model.layers.6.self_attn (%8322:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=270)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8027:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %8028:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) -> (%8354:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)], %8335:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282)], %8337:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294), )] (%8354:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)], 
%8321:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)]) -> (%8355:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=295), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=296))] (%8355:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)]) -> (%8356:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=295)]) - graph.CallGraphOp @model.layers.6.mlp (%8356:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=295)]) -> (%8361:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303), )] (%8361:tensor<[1, 
32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)], %8355:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)]) -> (%8362:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)]) - cf.ReturnOp (%8362:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)], %8335:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282)], %8337:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)]) -> () + (%8580:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8220:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %8221:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, 
quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37)]) -> (%8632:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8603:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289)], %8605:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=291)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=276, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=277, solved=0))] (%8580:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8581:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=276)]) + graph.CallGraphOp @model.layers.6.self_attn (%8581:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=276)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 
128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8220:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %8221:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37)]) -> (%8623:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=301)], %8603:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289)], %8605:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=291)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=301, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8580:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8623:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=301)]) -> (%8624:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=302, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=303, solved=0))] (%8624:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8625:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=302)]) + graph.CallGraphOp @model.layers.6.mlp (%8625:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=302)]) -> (%8631:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=310)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=310, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8624:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8631:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=310)]) -> (%8632:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + cf.ReturnOp (%8632:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8603:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289)], %8605:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=291)]) -> () } } graph.SubGraphOp @model.layers.6.self_attn [using_qnn:true, symbol:model.layers.6.self_attn] { - (%8322:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=270)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, 
CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8027:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %8028:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) -> (%8354:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)], %8335:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282)], %8337:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)]) { - linalg.CPU.LinearOp (%8322:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=270)]) -> (%8323:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=276)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=270), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=273), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=272))] (%8322:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=270)]) -> (%8324:tensor<[1, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=273)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=270), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=274))] (%8322:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=270)]) -> (%8325:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=276), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=276), )] (%8323:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=276)]) -> (%8323:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=276)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=276), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=276), )] (%8323:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=276)]) -> (%8326:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=276)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=273), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=273), )] (%8324:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=273)]) -> (%8324:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=273)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=273), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=273), )] (%8324:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=273)]) -> (%8327:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=273)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275), )] (%8325:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275)]) -> (%8325:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275), )] (%8325:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275)]) -> (%8328:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=276), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=278))] (%8326:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=276)]) -> (%8329:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=273), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=280))] (%8327:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=273)]) -> (%8330:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=277), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277), )] (%8329:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8331:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279), )] (%8330:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8332:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279), outputs_0:QuantSpec(Raw(type: Float16), uuid=281), )] (%8332:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279)]) -> (%8333:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=281)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=281), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282), )] (%8333:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=281)]) -> (%8334:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282), )] (%8334:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282)]) -> (%8335:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=282)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275), outputs_0:QuantSpec(Raw(type: Float16), uuid=283), )] (%8328:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275)]) -> (%8336:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=283)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=283), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284), )] (%8336:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=283)]) -> (%8337:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9), )] (%8027:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %8335:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282)]) -> (%8338:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=9)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37), )] (%8028:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)], %8337:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)]) -> (%8339:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9), )] (%8338:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)]) -> (%8340:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37), )] (%8339:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) -> (%8341:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=285), )] (%8331:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277)], %8340:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)]) -> (%8342:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=285)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=285), inputs_1:QuantSpec(Raw(type: Float32), uuid=286), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=285), )] (%8342:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=285)], %8343:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=286), constant:[0.088388346]]) -> (%8344:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=285)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=285), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287), )] (%8344:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=285)]) -> (%8345:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287), inputs_1:QuantSpec(Raw(type: Int16), uuid=288), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287), )] (%8345:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287)], %8346:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=288), constant:[-20]]) -> (%8347:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=289), outputs_0:QuantSpec(Raw(type: UInt8), uuid=290), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8348:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=289), constant:[0]]) -> (%8349:tensor<[1, 1, 32, 1024], UInt8, 
CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=290)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=290), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=285), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287), )] (%8349:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=290)], %8344:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=285)], %8347:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287)]) -> (%8350:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=291), )] (%8350:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287)]) -> (%8351:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=291)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=291), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=292), )] (%8351:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=291)], %8341:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) -> (%8352:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=292)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=292), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=292), )] (%8352:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=292)]) -> (%8353:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=292)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=292), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=292), )] (%8353:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=292)]) -> 
(%8353:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=292)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=292), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=293))] (%8353:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=292)]) -> (%8354:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)]) - cf.ReturnOp (%8354:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)], %8335:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=282)], %8337:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)]) -> () + (%8581:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=276)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8220:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %8221:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37)]) -> (%8623:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=301)], %8603:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289)], %8605:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=291)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=276, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=279, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=278, solved=0))] (%8581:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=276)]) -> (%8582:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=279)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=276, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=281, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=280, solved=0))] (%8581:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=276)]) -> (%8583:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=281)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=276, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=283, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=282, solved=0))] (%8581:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=276)]) -> (%8584:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=283)]) + 
linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=279, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=279, solved=0), )] (%8582:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=279)]) -> (%8582:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=279)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=279, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=279, solved=0), )] (%8582:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=279)]) -> (%8585:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=279)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=281, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=281, solved=0), )] (%8583:tensor<[1, 32, 1024], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=281)]) -> (%8583:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=281)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=281, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=281, solved=0), )] (%8583:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=281)]) -> (%8586:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=281)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=283, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=283, solved=0), )] (%8584:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=283)]) -> (%8584:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=283)]) + linalg.CPU.TransposeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=283, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=283, solved=0), )] (%8584:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=283)]) -> (%8587:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=283)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=279, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=285, solved=0))] (%8585:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=279)]) -> (%8588:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=281, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=286, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=287, solved=0))] (%8586:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=281)]) -> (%8589:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), )] (%8588:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)]) -> (%8588:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), )] (%8588:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)]) -> (%8588:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), )] (%8588:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)]) -> (%8590:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), )] (%8590:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)], %8588:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)]) -> (%8591:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), )] (%8591:tensor<[1, 16, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8592:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), )] (%8588:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8593:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), )] (%8593:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)], %8592:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)]) -> (%8594:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), )] (%8589:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)]) -> (%8589:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), )] (%8589:tensor<[1, 8, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)]) -> (%8589:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), )] (%8589:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)]) -> (%8595:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), )] (%8595:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)], %8589:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=286)]) -> (%8596:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), )] (%8596:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8597:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), )] (%8589:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=286)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8598:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), )] (%8598:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)], %8597:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)]) -> (%8599:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=288, solved=0), )] (%8599:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=286)]) -> (%8600:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=288)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=288, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289, solved=0), )] (%8600:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=288)]) -> (%8601:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289, solved=0), )] (%8601:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289)]) -> (%8603:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=283, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=290, solved=0), )] (%8587:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=283)]) -> (%8604:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=290)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), 
uuid=290, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=291, solved=0), )] (%8604:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=290)]) -> (%8605:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=291)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9, solved=0), )] (%8220:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %8603:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289)]) -> (%8607:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=291, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37, solved=0), )] (%8221:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37)], 
%8605:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=291)]) -> (%8608:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9, solved=0), )] (%8607:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)]) -> (%8609:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37, solved=0), )] (%8608:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37)]) -> (%8610:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=292, solved=0), )] (%8594:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)], %8609:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)]) -> (%8611:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=292)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=292, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=293, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=292, solved=0), )] (%8611:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=292)], %8612:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=293), constant:[0.088388346]]) -> (%8613:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=292)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=292, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=294, solved=0), )] (%8613:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=292)]) -> (%8614:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=294)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=294, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=295, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=294, solved=0), )] (%8614:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=294)], %8615:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=295), constant:[-20]]) -> (%8616:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=294)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=296, solved=0), outputs_0:QuantSpec(Raw(type: 
UInt8), uuid=297, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8617:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=296), constant:[0]]) -> (%8618:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=297)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=297, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=292, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=294, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=294, solved=0), )] (%8618:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=297)], %8613:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=292)], %8616:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=294)]) -> (%8619:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=294)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=294, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=298, solved=0), )] 
(%8619:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=294)]) -> (%8620:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=298)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=298, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=299, solved=0), )] (%8620:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=298)], %8610:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37)]) -> (%8621:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=299)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=299, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=299, solved=0), )] (%8621:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=299)]) -> (%8622:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=299)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=299, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=299, solved=0), )] (%8622:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=299)]) -> (%8622:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=299)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=299, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=301, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=300, solved=0))] (%8622:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=299)]) -> (%8623:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=301)]) + 
cf.ReturnOp (%8623:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=301)], %8603:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289)], %8605:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=291)]) -> () } } graph.SubGraphOp @model.layers.6.mlp [using_qnn:true, symbol:model.layers.6.mlp] { - (%8356:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=295)]) -> (%8361:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=295), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=298), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=297))] (%8356:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=295)]) -> (%8357:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=298)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=298), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=299), )] (%8357:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=298)]) -> (%8358:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=299)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=295), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=301), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=300))] (%8356:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=295)]) -> (%8359:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=301)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=299), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=301), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=299), )] (%8358:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=299)], %8359:tensor<[1, 32, 6144], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=301)]) -> (%8360:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=299)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=299), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=302))] (%8360:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=299)]) -> (%8361:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)]) - cf.ReturnOp (%8361:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)]) -> () + (%8625:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=302)]) -> (%8631:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=310)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=302, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=305, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=304, solved=0))] (%8625:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=302)]) -> (%8626:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=305)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=302, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=306, solved=0))] (%8625:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=302)]) -> (%8627:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=308, solved=0), )] (%8627:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307)]) -> (%8628:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=308)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=308, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307, solved=0), )] (%8627:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307)], %8628:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=308)]) -> (%8629:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=305, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307, solved=0), )] (%8629:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307)], %8626:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=305)]) -> (%8630:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=310, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=309, solved=0))] (%8630:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307)]) -> (%8631:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=310)]) + cf.ReturnOp (%8631:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=310)]) -> () } } graph.SubGraphOp @model.layers.7 [using_qnn:true, 
symbol:model.layers.7] { - (%8362:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8029:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %8030:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) -> (%8403:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)], %8376:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)], %8378:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=318)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=305))] (%8362:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)]) -> (%8363:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304)]) - graph.CallGraphOp @model.layers.7.self_attn (%8363:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8029:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %8030:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) -> (%8395:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328)], %8376:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)], %8378:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=318)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=328), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328), )] (%8395:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328)], %8362:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=303)]) -> (%8396:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=329), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=330))] (%8396:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328)]) -> (%8397:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=329)]) - graph.CallGraphOp @model.layers.7.mlp (%8397:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=329)]) -> (%8402:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=337)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337), )] (%8402:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)], %8396:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328)]) -> (%8403:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)]) - cf.ReturnOp (%8403:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)], %8376:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)], %8378:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=318)]) -> () + (%8632:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8222:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %8223:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38)]) -> (%8684:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8655:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324)], %8657:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=326)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=311, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=312, solved=0))] (%8632:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8633:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=311)]) + graph.CallGraphOp @model.layers.7.self_attn (%8633:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=311)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8222:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %8223:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38)]) -> (%8675:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=336)], %8655:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324)], %8657:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=326)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=336, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8632:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8675:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=336)]) -> (%8676:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=337, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=338, solved=0))] (%8676:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8677:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=337)]) + graph.CallGraphOp @model.layers.7.mlp (%8677:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=337)]) -> (%8683:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=345)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=345, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8676:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8683:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=345)]) -> (%8684:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + cf.ReturnOp (%8684:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8655:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324)], %8657:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=326)]) -> () } } graph.SubGraphOp @model.layers.7.self_attn [using_qnn:true, symbol:model.layers.7.self_attn] { - (%8363:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8029:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %8030:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) -> (%8395:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328)], %8376:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)], %8378:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=318)]) { - linalg.CPU.LinearOp (%8363:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304)]) -> (%8364:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=310)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=306))] (%8363:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304)]) -> (%8365:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=308))] (%8363:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304)]) -> (%8366:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=310), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=310), )] (%8364:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=310)]) -> (%8364:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=310)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=310), outputs_0:QuantSpec(Raw(type: 
Int16PerTensor), uuid=310), )] (%8364:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=310)]) -> (%8367:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=310)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307), )] (%8365:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307)]) -> (%8365:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307), )] (%8365:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307)]) -> (%8368:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309), )] (%8366:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309)]) -> (%8366:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309), )] (%8366:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309)]) -> (%8369:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=310), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=311), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=312))] (%8367:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=310)]) -> (%8370:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=311)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=313), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=314))] (%8368:tensor<[1, 8, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307)]) -> (%8371:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=313)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=311), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=311), )] (%8370:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=311)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8372:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=311)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=313), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=313), )] (%8371:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=313)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8373:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=313)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=313), outputs_0:QuantSpec(Raw(type: Float16), uuid=315), )] (%8373:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=313)]) -> (%8374:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=315)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=315), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316), )] (%8374:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=315)]) -> (%8375:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 
-128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316), )] (%8375:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)]) -> (%8376:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309), outputs_0:QuantSpec(Raw(type: Float16), uuid=317), )] (%8369:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309)]) -> (%8377:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=317)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=317), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=318), )] (%8377:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=317)]) -> (%8378:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=318)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10), )] 
(%8029:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %8376:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)]) -> (%8379:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=318), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38), )] (%8030:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)], %8378:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=318)]) -> (%8380:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10), )] (%8379:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)]) -> (%8381:tensor<[1, 16, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38), )] (%8380:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) -> (%8382:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=311), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319), )] (%8372:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=311)], %8381:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)]) -> (%8383:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319), inputs_1:QuantSpec(Raw(type: Float32), uuid=320), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319), )] (%8383:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319)], %8384:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=320), constant:[0.088388346]]) -> (%8385:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=321), )] (%8385:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319)]) -> (%8386:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=321)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=321), inputs_1:QuantSpec(Raw(type: Int16), uuid=322), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=321), )] (%8386:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=321)], %8387:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=322), constant:[-20]]) -> (%8388:tensor<[1, 16, 32, 1], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=321)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=323), outputs_0:QuantSpec(Raw(type: UInt8), uuid=324), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8389:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=323), constant:[0]]) -> (%8390:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=324)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=324), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=321), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=321), )] (%8390:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=324)], %8385:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319)], %8388:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=321)]) -> (%8391:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=321)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=321), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=325), )] (%8391:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=321)]) -> (%8392:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=325)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=325), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326), )] (%8392:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=325)], %8382:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) -> (%8393:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326), )] (%8393:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326)]) -> (%8394:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326), )] (%8394:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326)]) -> (%8394:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=327))] (%8394:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326)]) -> (%8395:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328)]) - cf.ReturnOp (%8395:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=328)], %8376:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)], %8378:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=318)]) -> () + (%8633:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=311)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8222:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %8223:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38)]) -> (%8675:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=336)], %8655:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324)], %8657:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=326)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=311, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=314, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=313, solved=0))] (%8633:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=311)]) -> (%8634:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=314)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=311, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=316, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=315, solved=0))] (%8633:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=311)]) -> (%8635:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=316)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=311, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=318, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, 
quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=317, solved=0))] (%8633:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=311)]) -> (%8636:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=318)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=314, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=314, solved=0), )] (%8634:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=314)]) -> (%8634:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=314)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=314, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=314, solved=0), )] (%8634:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=314)]) -> (%8637:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=314)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=316, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=316, solved=0), )] (%8635:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=316)]) -> (%8635:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=316)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=316, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=316, solved=0), )] (%8635:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=316)]) -> (%8638:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=316)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=318, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=318, 
solved=0), )] (%8636:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=318)]) -> (%8636:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=318)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=318, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=318, solved=0), )] (%8636:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=318)]) -> (%8639:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=318)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=314, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=320, solved=0))] (%8637:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=314)]) -> (%8640:tensor<[1, 16, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=316, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=322, solved=0))] (%8638:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=316)]) -> (%8641:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), )] (%8640:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)]) -> (%8640:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), )] (%8640:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)]) -> (%8640:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)]) + 
linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), )] (%8640:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)]) -> (%8642:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), )] (%8642:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)], %8640:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)]) -> (%8643:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 
0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), )] (%8643:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8644:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), )] (%8640:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8645:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), )] (%8645:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)], %8644:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)]) -> (%8646:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), )] (%8641:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) -> (%8641:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) + linalg.CPU.SliceOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), )] (%8641:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) -> (%8641:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), )] (%8641:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) -> (%8647:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=321, solved=0), )] (%8647:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)], %8641:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) -> (%8648:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), )] (%8648:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8649:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), )] (%8641:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8650:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), )] (%8650:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)], %8649:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) -> (%8651:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) + linalg.CPU.CastTypeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=323, solved=0), )] (%8651:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) -> (%8652:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=323)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=323, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324, solved=0), )] (%8652:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=323)]) -> (%8653:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324, solved=0), )] (%8653:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324)]) -> (%8655:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=318, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=325, 
solved=0), )] (%8639:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=318)]) -> (%8656:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=325)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=325, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=326, solved=0), )] (%8656:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=325)]) -> (%8657:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=326)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10, solved=0), )] (%8222:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %8655:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324)]) -> (%8659:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38, solved=0), 
inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=326, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38, solved=0), )] (%8223:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38)], %8657:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=326)]) -> (%8660:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10, solved=0), )] (%8659:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)]) -> (%8661:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38, solved=0), )] (%8660:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38)]) -> (%8662:tensor<[1, 16, 1024, 128], UInt8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=327, solved=0), )] (%8646:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)], %8661:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)]) -> (%8663:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=327)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=327, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=328, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=327, solved=0), )] (%8663:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=327)], %8664:tensor<[1], Float32, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=328), constant:[0.088388346]]) -> (%8665:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=327)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=327, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=329, solved=0), )] (%8665:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=327)]) -> (%8666:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=329)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=329, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=330, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=329, solved=0), )] (%8666:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=329)], %8667:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=330), constant:[-20]]) -> (%8668:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=329)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=331, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=332, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8669:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=331), constant:[0]]) -> (%8670:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=332)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=332, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=327, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=329, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=329, solved=0), )] (%8670:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=332)], %8665:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=327)], %8668:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=329)]) -> (%8671:tensor<[1, 16, 32, 1024], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=329)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=329, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=333, solved=0), )] (%8671:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=329)]) -> (%8672:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=333)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=333, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=334, solved=0), )] (%8672:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=333)], %8662:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38)]) -> (%8673:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=334)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=334, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=334, solved=0), )] (%8673:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=334)]) -> (%8674:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=334)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=334, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=334, solved=0), )] (%8674:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=334)]) -> (%8674:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=334)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=334, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=336, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, 
block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=335, solved=0))] (%8674:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=334)]) -> (%8675:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=336)]) + cf.ReturnOp (%8675:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=336)], %8655:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324)], %8657:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=326)]) -> () } } graph.SubGraphOp @model.layers.7.mlp [using_qnn:true, symbol:model.layers.7.mlp] { - (%8397:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=329)]) -> (%8402:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=329), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=332), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: 
Float32), uuid=331))] (%8397:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=329)]) -> (%8398:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=332)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=332), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=333), )] (%8398:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=332)]) -> (%8399:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=333)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=329), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=335), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=334))] (%8397:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=329)]) -> (%8400:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=335)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=333), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=335), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=333), )] (%8399:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=333)], %8400:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=335)]) -> (%8401:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=333)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=333), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=336))] (%8401:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=333)]) -> (%8402:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)]) - cf.ReturnOp (%8402:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)]) -> () + (%8677:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=337)]) -> (%8683:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=345)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=337, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=340, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=339, solved=0))] (%8677:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=337)]) -> (%8678:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=340)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=337, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=341, solved=0))] (%8677:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=337)]) -> 
(%8679:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=343, solved=0), )] (%8679:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342)]) -> (%8680:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=343)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=343, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342, solved=0), )] (%8679:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342)], %8680:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=343)]) -> (%8681:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=340, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342, solved=0), )] (%8681:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342)], %8678:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=340)]) -> (%8682:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=345, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=344, solved=0))] (%8682:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342)]) -> (%8683:tensor<[1, 32, 2048], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=345)]) + cf.ReturnOp (%8683:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=345)]) -> () } } graph.SubGraphOp @model.layers.8 [using_qnn:true, symbol:model.layers.8] { - (%8403:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8031:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %8032:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) -> (%8444:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)], %8417:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350)], %8419:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=352)]) { - 
linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=338), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=339))] (%8403:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)]) -> (%8404:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=338)]) - graph.CallGraphOp @model.layers.8.self_attn (%8404:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=338)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8031:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %8032:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) -> (%8436:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362)], 
%8417:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350)], %8419:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=352)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362), )] (%8436:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362)], %8403:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)]) -> (%8437:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=363), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=364))] (%8437:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362)]) -> (%8438:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=363)]) - graph.CallGraphOp @model.layers.8.mlp (%8438:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=363)]) -> (%8443:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371), )] (%8443:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)], %8437:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362)]) -> (%8444:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)]) - cf.ReturnOp (%8444:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)], %8417:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350)], %8419:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: 
Int8, scale_type: Float32), uuid=352)]) -> () + (%8684:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8224:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %8225:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39)]) -> (%8736:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8707:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359)], %8709:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=361)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=346, solved=0), 
weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=347, solved=0))] (%8684:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8685:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=346)]) + graph.CallGraphOp @model.layers.8.self_attn (%8685:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=346)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8224:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %8225:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39)]) -> (%8727:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=371)], %8707:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359)], 
%8709:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=361)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=371, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8684:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8727:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=371)]) -> (%8728:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=372, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=373, solved=0))] (%8728:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=60)]) -> (%8729:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=372)]) + graph.CallGraphOp @model.layers.8.mlp (%8729:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=372)]) -> (%8735:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=380)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=380, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8728:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8735:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=380)]) -> (%8736:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + cf.ReturnOp (%8736:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=60)], %8707:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359)], %8709:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=361)]) -> () } } graph.SubGraphOp @model.layers.8.self_attn [using_qnn:true, symbol:model.layers.8.self_attn] { - (%8404:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=338)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8031:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %8032:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) -> (%8436:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362)], %8417:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350)], %8419:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=352)]) { - linalg.CPU.LinearOp 
(%8404:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=338)]) -> (%8405:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=344)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=338), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=340))] (%8404:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=338)]) -> (%8406:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=338), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=343), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=342))] (%8404:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=338)]) -> (%8407:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=343)]) - linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=344), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=344), )] (%8405:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=344)]) -> (%8405:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=344)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=344), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=344), )] (%8405:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=344)]) -> (%8408:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=344)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341), )] (%8406:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341)]) -> (%8406:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341), )] (%8406:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341)]) -> (%8409:tensor<[1, 8, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=343), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=343), )] (%8407:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=343)]) -> (%8407:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=343)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=343), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=343), )] (%8407:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=343)]) -> (%8410:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=343)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=344), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=345), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=346))] (%8408:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=344)]) -> (%8411:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=345)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=347), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=348))] (%8409:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341)]) -> (%8412:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=347)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=345), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=345), )] (%8411:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=345)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8413:tensor<[1, 16, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=345)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=347), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=347), )] (%8412:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=347)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8414:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=347)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=347), outputs_0:QuantSpec(Raw(type: Float16), uuid=349), )] (%8414:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=347)]) -> (%8415:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=349)]) - linalg.CPU.CastTypeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=349), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350), )] (%8415:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=349)]) -> (%8416:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350), )] (%8416:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350)]) -> (%8417:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=343), outputs_0:QuantSpec(Raw(type: Float16), uuid=351), )] (%8410:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=343)]) -> (%8418:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=351)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=351), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=352), )] (%8418:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=351)]) -> (%8419:tensor<[1, 8, 
32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=352)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11), )] (%8031:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %8417:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350)]) -> (%8420:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=352), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39), )] (%8032:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)], %8419:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=352)]) -> (%8421:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=39)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11), )] (%8420:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)]) -> (%8422:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39), )] (%8421:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) -> (%8423:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=345), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=353), )] (%8413:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=345)], %8422:tensor<[1, 16, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)]) -> (%8424:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=353)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=353), inputs_1:QuantSpec(Raw(type: Float32), uuid=354), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=353), )] (%8424:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=353)], %8425:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=354), constant:[0.088388346]]) -> (%8426:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=353)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=353), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=355), )] (%8426:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=353)]) -> (%8427:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=355)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=355), 
inputs_1:QuantSpec(Raw(type: Int16), uuid=356), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=355), )] (%8427:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=355)], %8428:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=356), constant:[-20]]) -> (%8429:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=355)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=357), outputs_0:QuantSpec(Raw(type: UInt8), uuid=358), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8430:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=357), constant:[1]]) -> (%8431:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=358)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=358), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=353), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=355), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=355), )] (%8431:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=358)], %8426:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=353)], %8429:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=355)]) -> (%8432:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=355)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=355), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=359), )] (%8432:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=355)]) -> (%8433:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=359)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=359), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360), )] (%8433:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=359)], %8423:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) -> (%8434:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360), )] (%8434:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360)]) -> (%8435:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360), )] (%8435:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360)]) -> (%8435:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=361))] (%8435:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360)]) -> (%8436:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362)]) - cf.ReturnOp (%8436:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362)], %8417:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=350)], %8419:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=352)]) -> () + (%8685:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=346)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8224:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %8225:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39)]) -> (%8727:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=371)], %8707:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), 
uuid=359)], %8709:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=361)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=346, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=349, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=348, solved=0))] (%8685:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=346)]) -> (%8686:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=349)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=346, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=351, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=350, solved=0))] (%8685:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=346)]) -> (%8687:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=351)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=346, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=353, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=352, solved=0))] (%8685:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=346)]) -> (%8688:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=353)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=349, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=349, solved=0), )] (%8686:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=349)]) -> (%8686:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=349)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=349, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=349, solved=0), )] (%8686:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=349)]) -> (%8689:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=349)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=351, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=351, solved=0), )] (%8687:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=351)]) -> (%8687:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=351)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=351, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=351, solved=0), )] (%8687:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=351)]) -> (%8690:tensor<[1, 8, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=351)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=353, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=353, solved=0), )] (%8688:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=353)]) -> (%8688:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=353)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=353, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=353, solved=0), )] (%8688:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=353)]) -> (%8691:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=353)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=349, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=355, solved=0))] (%8689:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=349)]) -> (%8692:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=351, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=357, solved=0))] (%8690:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=351)]) -> (%8693:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), )] (%8692:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)]) -> (%8692:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), )] (%8692:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)]) -> (%8692:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), )] (%8692:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)]) -> (%8694:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), )] (%8694:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)], %8692:tensor<[1, 16, 
32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)]) -> (%8695:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), )] (%8695:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8696:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), )] (%8692:tensor<[1, 16, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8697:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), )] (%8697:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)], %8696:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)]) -> (%8698:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=356, solved=0), )] (%8693:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) -> (%8693:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), )] (%8693:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) -> (%8693:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), )] (%8693:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) -> (%8699:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=356)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), )] (%8699:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)], %8693:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) -> (%8700:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), )] (%8700:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32), uuid=63)]) -> (%8701:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), )] (%8693:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8702:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), )] (%8702:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=356)], %8701:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) -> (%8703:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=358, solved=0), )] (%8703:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) -> (%8704:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=358)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=358, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359, solved=0), )] (%8704:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=358)]) -> (%8705:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359, solved=0), )] (%8705:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, 
scale_type: Float32), uuid=359)]) -> (%8707:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=353, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=360, solved=0), )] (%8691:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=353)]) -> (%8708:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=360)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=360, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=361, solved=0), )] (%8708:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=360)]) -> (%8709:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=361)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11, solved=0), )] (%8224:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %8707:tensor<[1, 8, 128, 32], UInt8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359)]) -> (%8711:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=361, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39, solved=0), )] (%8225:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39)], %8709:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=361)]) -> (%8712:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11, solved=0), )] (%8711:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)]) -> (%8713:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)]) + linalg.CPU.RepeatOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39, solved=0), )] (%8712:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39)]) -> (%8714:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=362, solved=0), )] (%8698:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)], %8713:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)]) -> (%8715:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=362)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=362, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=363, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=362, solved=0), )] (%8715:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=362)], %8716:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=363), constant:[0.088388346]]) -> (%8717:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=362)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=362, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=364, solved=0), )] (%8717:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=362)]) -> (%8718:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=364)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=364, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=365, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=364, solved=0), )] (%8718:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=364)], %8719:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=365), constant:[-20]]) -> (%8720:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=364)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=366, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=367, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8721:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=366), constant:[0]]) -> (%8722:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=367)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=367, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=362, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=364, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=364, solved=0), )] (%8722:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=367)], %8717:tensor<[1, 16, 32, 
1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=362)], %8720:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=364)]) -> (%8723:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=364)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=364, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=368, solved=0), )] (%8723:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=364)]) -> (%8724:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=368)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=368, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=369, solved=0), )] (%8724:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=368)], %8714:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39)]) -> (%8725:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=369)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=369, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=369, solved=0), )] (%8725:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=369)]) -> (%8726:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=369)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=369, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=369, solved=0), )] (%8726:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=369)]) -> (%8726:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=369)]) 
+ linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=369, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=371, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=370, solved=0))] (%8726:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=369)]) -> (%8727:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=371)]) + cf.ReturnOp (%8727:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=371)], %8707:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359)], %8709:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=361)]) -> () } } graph.SubGraphOp @model.layers.8.mlp [using_qnn:true, symbol:model.layers.8.mlp] { - (%8438:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=363)]) -> (%8443:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)]) { - linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=363), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=366), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=365))] (%8438:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=363)]) -> (%8439:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=366)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=366), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367), )] (%8439:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=366)]) -> (%8440:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=363), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=369), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=368))] (%8438:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=363)]) -> (%8441:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=369)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=369), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367), )] (%8440:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367)], %8441:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=369)]) -> (%8442:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=370))] (%8442:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367)]) -> (%8443:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)]) - cf.ReturnOp (%8443:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)]) -> () + (%8729:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=372)]) -> (%8735:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=380)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=372, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=375, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=374, solved=0))] (%8729:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=372)]) -> (%8730:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=375)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=372, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=377, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=376, solved=0))] (%8729:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=372)]) -> (%8731:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=378, solved=0), )] (%8731:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377)]) -> (%8732:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=378)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=378, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377, solved=0), )] (%8731:tensor<[1, 32, 6144], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377)], %8732:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=378)]) -> (%8733:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=375, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377, solved=0), )] (%8733:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377)], %8730:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=375)]) -> (%8734:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=380, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=379, solved=0))] (%8734:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377)]) -> (%8735:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=380)]) + cf.ReturnOp (%8735:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=380)]) -> () } } graph.SubGraphOp @model.layers.9 [using_qnn:true, symbol:model.layers.9] { - (%8444:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8033:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %8034:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) -> (%8485:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)], %8458:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384)], %8460:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=386)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=373))] (%8444:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)]) -> (%8445:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372)]) - graph.CallGraphOp @model.layers.9.self_attn (%8445:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8033:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %8034:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) -> (%8477:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396)], %8458:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384)], %8460:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=386)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396), )] (%8477:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396)], %8444:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)]) -> (%8478:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=398))] (%8478:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396)]) -> (%8479:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397)]) - graph.CallGraphOp @model.layers.9.mlp (%8479:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397)]) -> (%8484:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405), )] (%8484:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)], %8478:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396)]) -> (%8485:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)]) - cf.ReturnOp 
(%8485:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)], %8458:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384)], %8460:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=386)]) -> () + (%8736:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8226:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %8227:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40)]) -> (%8788:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8759:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394)], %8761:tensor<[1, 8, 32, 128], UInt8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=396)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=381, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=382, solved=0))] (%8736:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8737:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=381)]) + graph.CallGraphOp @model.layers.9.self_attn (%8737:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=381)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8226:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %8227:tensor<[1, 8, 992, 128], UInt8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40)]) -> (%8779:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=406)], %8759:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394)], %8761:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=396)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=406, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8736:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8779:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=406)]) -> (%8780:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=407, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=408, solved=0))] (%8780:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8781:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=407)]) + graph.CallGraphOp @model.layers.9.mlp (%8781:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=407)]) -> (%8787:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=415)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=415, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8780:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8787:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=415)]) -> (%8788:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + cf.ReturnOp (%8788:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8759:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394)], %8761:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=396)]) -> () } } graph.SubGraphOp @model.layers.9.self_attn [using_qnn:true, symbol:model.layers.9.self_attn] { - (%8445:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8033:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %8034:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) -> (%8477:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396)], %8458:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384)], %8460:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=386)]) { - linalg.CPU.LinearOp (%8445:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372)]) -> (%8446:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=378)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=375), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=374))] (%8445:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372)]) -> (%8447:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=375)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, 
block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=376))] (%8445:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372)]) -> (%8448:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=378), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=378), )] (%8446:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=378)]) -> (%8446:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=378)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=378), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=378), )] (%8446:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=378)]) -> (%8449:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=378)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=375), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=375), )] (%8447:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=375)]) -> (%8447:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=375)]) - linalg.CPU.TransposeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=375), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=375), )] (%8447:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=375)]) -> (%8450:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=375)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377), )] (%8448:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377)]) -> (%8448:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377), )] (%8448:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377)]) -> (%8451:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377)]) - 
linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=378), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=380))] (%8449:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=378)]) -> (%8452:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=375), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=381), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=382))] (%8450:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=375)]) -> (%8453:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=381)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379), )] (%8452:tensor<[1, 16, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8454:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=381), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=381), )] (%8453:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=381)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8455:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=381)]) - linalg.CPU.CastTypeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=381), outputs_0:QuantSpec(Raw(type: Float16), uuid=383), )] (%8455:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=381)]) -> (%8456:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=383)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=383), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384), )] (%8456:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=383)]) -> (%8457:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384), )] (%8457:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384)]) -> (%8458:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377), outputs_0:QuantSpec(Raw(type: Float16), uuid=385), )] (%8451:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=377)]) -> (%8459:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=385)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=385), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=386), )] (%8459:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=385)]) -> (%8460:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=386)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12), )] (%8033:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %8458:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384)]) -> (%8461:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=386), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=40), )] (%8034:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)], %8460:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=386)]) -> (%8462:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12), )] (%8461:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)]) -> (%8463:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40), )] (%8462:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) -> (%8464:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=379), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387), )] (%8454:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379)], %8463:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)]) -> (%8465:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387), inputs_1:QuantSpec(Raw(type: Float32), uuid=388), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387), )] (%8465:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387)], %8466:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=388), constant:[0.088388346]]) -> (%8467:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389), )] (%8467:tensor<[1, 16, 32, 1024], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387)]) -> (%8468:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389), inputs_1:QuantSpec(Raw(type: Int16), uuid=390), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389), )] (%8468:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389)], %8469:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=390), constant:[-20]]) -> (%8470:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=391), outputs_0:QuantSpec(Raw(type: UInt8), uuid=392), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8471:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=391), constant:[-0.1796875]]) -> (%8472:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=392)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=392), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389), )] (%8472:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=392)], %8467:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387)], %8470:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389)]) -> (%8473:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=393), )] (%8473:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389)]) -> (%8474:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=393)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=393), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394), )] (%8474:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=393)], %8464:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) -> (%8475:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394), )] (%8475:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394)]) -> (%8476:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394), )] (%8476:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394)]) -> (%8476:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=396), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=395))] (%8476:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394)]) -> (%8477:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396)]) - cf.ReturnOp (%8477:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=396)], %8458:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=384)], %8460:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=386)]) -> () + (%8737:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=381)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8226:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %8227:tensor<[1, 8, 992, 128], 
UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40)]) -> (%8779:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=406)], %8759:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394)], %8761:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=396)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=381, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=384, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=383, solved=0))] (%8737:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=381)]) -> (%8738:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=384)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=381, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=386, 
solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=385, solved=0))] (%8737:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=381)]) -> (%8739:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=386)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=381, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=388, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=387, solved=0))] (%8737:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=381)]) -> (%8740:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=388)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=384, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=384, solved=0), )] (%8738:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=384)]) -> (%8738:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=384)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=384, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=384, solved=0), )] (%8738:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=384)]) -> (%8741:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=384)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=386, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=386, solved=0), )] (%8739:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=386)]) -> (%8739:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=386)]) + linalg.CPU.TransposeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=386, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=386, solved=0), )] (%8739:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=386)]) -> (%8742:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=386)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=388, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=388, solved=0), )] (%8740:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=388)]) -> (%8740:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=388)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=388, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=388, solved=0), )] (%8740:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=388)]) -> (%8743:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=388)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=384, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=390, solved=0))] (%8741:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=384)]) -> (%8744:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=386, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=392, solved=0))] (%8742:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=386)]) -> (%8745:tensor<[1, 8, 32, 
128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), )] (%8744:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)]) -> (%8744:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), )] (%8744:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)]) -> (%8744:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), )] (%8744:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)]) -> (%8746:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=389, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), )] (%8746:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)], %8744:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)]) -> (%8747:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), )] (%8747:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8748:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), )] (%8744:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8749:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), )] (%8749:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)], %8748:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)]) -> (%8750:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), )] (%8745:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)]) -> (%8745:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), )] (%8745:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)]) -> (%8745:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=391, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), )] (%8745:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)]) -> (%8751:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), )] (%8751:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)], %8745:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)]) -> (%8752:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), )] (%8752:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8753:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), )] (%8745:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8754:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), )] (%8754:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)], %8753:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)]) -> (%8755:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=393, solved=0), )] (%8755:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)]) -> (%8756:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=393)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=393, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394, solved=0), )] (%8756:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=393)]) -> (%8757:tensor<[1, 8, 32, 128], 
UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394, solved=0), )] (%8757:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394)]) -> (%8759:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=388, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=395, solved=0), )] (%8743:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=388)]) -> (%8760:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=395)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=395, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=396, solved=0), )] (%8760:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=395)]) -> (%8761:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=396)]) + linalg.CPU.ConcatOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12, solved=0), )] (%8226:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %8759:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394)]) -> (%8763:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=396, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40, solved=0), )] (%8227:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40)], %8761:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=396)]) -> (%8764:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 
-128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12, solved=0), )] (%8763:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)]) -> (%8765:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40, solved=0), )] (%8764:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40)]) -> (%8766:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=397, solved=0), )] (%8750:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)], %8765:tensor<[1, 16, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)]) -> (%8767:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=397)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=397, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=398, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=397, solved=0), )] (%8767:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=397)], %8768:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=398), constant:[0.088388346]]) -> (%8769:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=397)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=397, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=399, solved=0), )] (%8769:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=397)]) -> (%8770:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=399)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=399, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=400, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=399, solved=0), )] (%8770:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=399)], %8771:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=400), constant:[-20]]) -> (%8772:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=399)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=401, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=402, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8773:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=401), constant:[0]]) -> (%8774:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=402)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), 
uuid=402, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=397, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=399, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=399, solved=0), )] (%8774:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=402)], %8769:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=397)], %8772:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=399)]) -> (%8775:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=399)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=399, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=403, solved=0), )] (%8775:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=399)]) -> (%8776:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=403)]) + 
linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=403, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=404, solved=0), )] (%8776:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=403)], %8766:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40)]) -> (%8777:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=404)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=404, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=404, solved=0), )] (%8777:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=404)]) -> (%8778:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=404)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=404, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=404, solved=0), )] (%8778:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=404)]) -> (%8778:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=404)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=404, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=406, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=405, solved=0))] (%8778:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=404)]) -> (%8779:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=406)]) + cf.ReturnOp (%8779:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=406)], %8759:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394)], %8761:tensor<[1, 8, 32, 
128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=396)]) -> () } } graph.SubGraphOp @model.layers.9.mlp [using_qnn:true, symbol:model.layers.9.mlp] { - (%8479:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397)]) -> (%8484:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=400), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=399))] (%8479:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397)]) -> (%8480:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=400)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=400), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=401), )] (%8480:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=400)]) -> (%8481:tensor<[1, 32, 6144], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=401)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=403), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=402))] (%8479:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397)]) -> (%8482:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=403)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=401), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=403), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=401), )] (%8481:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=401)], %8482:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=403)]) -> (%8483:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=401)]) - linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=401), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=404))] (%8483:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=401)]) -> (%8484:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)]) - cf.ReturnOp (%8484:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)]) -> () + (%8781:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=407)]) -> (%8787:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=415)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=407, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=410, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=409, solved=0))] (%8781:tensor<[1, 32, 2048], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=407)]) -> (%8782:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=410)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=407, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=411, solved=0))] (%8781:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=407)]) -> (%8783:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=413, solved=0), )] (%8783:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412)]) -> (%8784:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=413)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=413, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412, solved=0), )] (%8783:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412)], %8784:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=413)]) -> (%8785:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=410, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412, solved=0), )] (%8785:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412)], %8782:tensor<[1, 32, 6144], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=410)]) -> (%8786:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=415, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=414, solved=0))] (%8786:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412)]) -> (%8787:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=415)]) + cf.ReturnOp (%8787:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=415)]) -> () } } graph.SubGraphOp @model.layers.10 [using_qnn:true, symbol:model.layers.10] { - (%8485:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8035:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %8036:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) -> (%8526:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)], %8499:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418)], %8501:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=420)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=406), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=407))] (%8485:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)]) -> (%8486:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=406)]) - graph.CallGraphOp @model.layers.10.self_attn (%8486:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=406)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8035:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %8036:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) -> (%8518:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430)], %8499:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418)], %8501:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=420)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430), )] (%8518:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430)], %8485:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=405)]) -> (%8519:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=431), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=432))] (%8519:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430)]) -> (%8520:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=431)]) - graph.CallGraphOp @model.layers.10.mlp (%8520:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=431)]) -> (%8525:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=439), )] (%8525:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)], %8519:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430)]) -> (%8526:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)]) - cf.ReturnOp (%8526:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)], %8499:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418)], %8501:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=420)]) -> () + (%8788:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8228:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %8229:tensor<[1, 8, 992, 
128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41)]) -> (%8840:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8811:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429)], %8813:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=431)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=416, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=417, solved=0))] (%8788:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8789:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=416)]) + graph.CallGraphOp @model.layers.10.self_attn (%8789:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=416)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8228:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %8229:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41)]) -> (%8831:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=441)], %8811:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429)], %8813:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=431)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=441, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8788:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8831:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=441)]) -> (%8832:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=442, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=443, solved=0))] (%8832:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8833:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=442)]) + graph.CallGraphOp @model.layers.10.mlp (%8833:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=442)]) -> (%8839:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=450)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=450, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8832:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8839:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=450)]) -> (%8840:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + cf.ReturnOp (%8840:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8811:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429)], %8813:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=431)]) -> () } } graph.SubGraphOp @model.layers.10.self_attn [using_qnn:true, symbol:model.layers.10.self_attn] { - (%8486:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=406)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8035:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %8036:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) -> (%8518:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430)], %8499:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418)], %8501:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=420)]) { - linalg.CPU.LinearOp (%8486:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=406)]) -> (%8487:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=412)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=406), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=408))] (%8486:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=406)]) 
-> (%8488:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=406), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=411), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=410))] (%8486:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=406)]) -> (%8489:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=411)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=412), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=412), )] (%8487:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=412)]) -> (%8487:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=412)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=412), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=412), )] (%8487:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=412)]) -> (%8490:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=412)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409), )] (%8488:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409)]) -> (%8488:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409), )] (%8488:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409)]) -> (%8491:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=411), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=411), )] (%8489:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=411)]) -> (%8489:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=411)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=411), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=411), )] (%8489:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=411)]) -> (%8492:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=411)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=412), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=413), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=414))] (%8490:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=412)]) -> (%8493:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=413)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=415), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=416))] (%8491:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409)]) -> (%8494:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=415)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=413), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=413), )] (%8493:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=413)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8495:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=413)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=415), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=415), )] (%8494:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=415)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8496:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=415)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=415), outputs_0:QuantSpec(Raw(type: Float16), uuid=417), )] (%8496:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=415)]) -> (%8497:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=417)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=417), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418), )] (%8497:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=417)]) -> (%8498:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418), )] (%8498:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418)]) -> (%8499:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=411), outputs_0:QuantSpec(Raw(type: Float16), uuid=419), )] (%8492:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=411)]) -> (%8500:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=419)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=419), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=420), )] (%8500:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=419)]) -> (%8501:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=420)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13), )] (%8035:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %8499:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418)]) -> (%8502:tensor<[1, 8, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=420), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41), )] (%8036:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)], %8501:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=420)]) -> (%8503:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13), )] (%8502:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)]) -> (%8504:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=41), )] (%8503:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) -> (%8505:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=413), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=421), )] (%8495:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=413)], %8504:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)]) -> (%8506:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=421)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=421), inputs_1:QuantSpec(Raw(type: Float32), uuid=422), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=421), )] (%8506:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=421)], %8507:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=422), constant:[0.088388346]]) -> 
(%8508:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=421)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=421), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=423), )] (%8508:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=421)]) -> (%8509:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=423)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=423), inputs_1:QuantSpec(Raw(type: Int16), uuid=424), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=423), )] (%8509:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=423)], %8510:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=424), constant:[-20]]) -> (%8511:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=423)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=425), outputs_0:QuantSpec(Raw(type: UInt8), uuid=426), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8512:tensor<[1], UInt16, 
CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=425), constant:[-0.93359375]]) -> (%8513:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=426)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=426), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=421), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=423), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=423), )] (%8513:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=426)], %8508:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=421)], %8511:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=423)]) -> (%8514:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=423)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=423), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427), )] (%8514:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=423)]) -> (%8515:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427)]) - linalg.CPU.MatMulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=428), )] (%8515:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427)], %8505:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) -> (%8516:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=428)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=428), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=428), )] (%8516:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=428)]) -> (%8517:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=428)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=428), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=428), )] (%8517:tensor<[1, 32, 16, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=428)]) -> (%8517:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=428)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=428), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=429))] (%8517:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=428)]) -> (%8518:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430)]) - cf.ReturnOp (%8518:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=430)], %8499:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=418)], %8501:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=420)]) -> () + (%8789:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=416)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8228:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %8229:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41)]) -> (%8831:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=441)], %8811:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429)], %8813:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=431)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=416, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=419, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=418, solved=0))] (%8789:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=416)]) -> 
(%8790:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=419)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=416, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=421, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=420, solved=0))] (%8789:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=416)]) -> (%8791:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=421)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=416, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=423, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=422, solved=0))] (%8789:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=416)]) -> (%8792:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 
0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=423)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=419, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=419, solved=0), )] (%8790:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=419)]) -> (%8790:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=419)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=419, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=419, solved=0), )] (%8790:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=419)]) -> (%8793:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=419)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=421, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=421, solved=0), )] (%8791:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=421)]) -> (%8791:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=421)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=421, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=421, solved=0), )] (%8791:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=421)]) -> (%8794:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=421)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=423, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=423, solved=0), )] (%8792:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=423)]) -> (%8792:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=423)]) + 
linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=423, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=423, solved=0), )] (%8792:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=423)]) -> (%8795:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=423)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=419, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=425, solved=0))] (%8793:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=419)]) -> (%8796:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=421, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=427, solved=0))] (%8794:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=421)]) -> (%8797:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + 
linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), )] (%8796:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)]) -> (%8796:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), )] (%8796:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)]) -> (%8796:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), )] (%8796:tensor<[1, 16, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)]) -> (%8798:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), )] (%8798:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)], %8796:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)]) -> (%8799:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), )] 
(%8799:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8800:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), )] (%8796:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8801:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=424, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), )] (%8801:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)], %8800:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)]) -> (%8802:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), )] (%8797:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)]) -> (%8797:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), )] (%8797:tensor<[1, 8, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)]) -> (%8797:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), )] (%8797:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)]) -> (%8803:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), )] (%8803:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)], %8797:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=426)]) -> (%8804:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), )] (%8804:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8805:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), )] (%8797:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=426)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8806:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), )] (%8806:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)], %8805:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)]) -> (%8807:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=428, solved=0), )] (%8807:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=426)]) -> (%8808:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=428)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=428, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429, solved=0), )] (%8808:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=428)]) -> (%8809:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429, solved=0), )] (%8809:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429)]) -> (%8811:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=423, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=430, solved=0), )] (%8795:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=423)]) -> (%8812:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=430)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), 
uuid=430, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=431, solved=0), )] (%8812:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=430)]) -> (%8813:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=431)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13, solved=0), )] (%8228:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %8811:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429)]) -> (%8815:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=431, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41, solved=0), )] (%8229:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), 
uuid=41)], %8813:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=431)]) -> (%8816:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13, solved=0), )] (%8815:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)]) -> (%8817:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41, solved=0), )] (%8816:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41)]) -> (%8818:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=13, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=432, solved=0), )] (%8802:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)], %8817:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)]) -> (%8819:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=432)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=432, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=433, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=432, solved=0), )] (%8819:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=432)], %8820:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=433), constant:[0.088388346]]) -> (%8821:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=432)]) + linalg.CPU.ReduceMinOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=432, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=434, solved=0), )] (%8821:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=432)]) -> (%8822:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=434)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=434, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=435, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=434, solved=0), )] (%8822:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=434)], %8823:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=435), constant:[-20]]) -> (%8824:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=434)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), 
inputs_1:QuantSpec(Raw(type: UInt16), uuid=436, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=437, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8825:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=436), constant:[0]]) -> (%8826:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=437)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=437, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=432, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=434, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=434, solved=0), )] (%8826:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=437)], %8821:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=432)], %8824:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=434)]) -> (%8827:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=434)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=434, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=438, solved=0), )] (%8827:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=434)]) -> (%8828:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=438)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=438, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=439, solved=0), )] (%8828:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=438)], %8818:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41)]) -> (%8829:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=439)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=439, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=439, solved=0), )] (%8829:tensor<[1, 16, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=439)]) -> (%8830:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=439)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=439, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=439, solved=0), )] (%8830:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=439)]) -> (%8830:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=439)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=439, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=441, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=440, solved=0))] (%8830:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=439)]) -> (%8831:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=441)]) + cf.ReturnOp (%8831:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=441)], %8811:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429)], %8813:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=431)]) -> () } } graph.SubGraphOp @model.layers.10.mlp [using_qnn:true, symbol:model.layers.10.mlp] { - (%8520:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=431)]) -> (%8525:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=431), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=434), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=433))] (%8520:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=431)]) -> (%8521:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=434)]) - linalg.CPU.SiLUOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=434), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=435), )] (%8521:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=434)]) -> (%8522:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=435)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=431), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=437), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=436))] (%8520:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=431)]) -> (%8523:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=437)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=435), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=437), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=435), )] (%8522:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=435)], %8523:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=437)]) -> (%8524:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=435)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=435), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=438))] (%8524:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=435)]) -> (%8525:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)]) - cf.ReturnOp (%8525:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)]) -> () + (%8833:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=442)]) -> (%8839:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=450)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=442, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=445, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=444, solved=0))] (%8833:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=442)]) -> (%8834:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=445)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=442, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=446, solved=0))] (%8833:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=442)]) -> (%8835:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=448, solved=0), )] (%8835:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447)]) -> (%8836:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=448)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=448, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447, solved=0), )] (%8835:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447)], %8836:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=448)]) -> (%8837:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=445, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447, solved=0), )] (%8837:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447)], %8834:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=445)]) -> (%8838:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=450, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=449, solved=0))] (%8838:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447)]) -> (%8839:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=450)]) + cf.ReturnOp (%8839:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=450)]) -> () } } 
graph.SubGraphOp @model.layers.11 [using_qnn:true, symbol:model.layers.11] { - (%8526:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8037:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %8038:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) -> (%8567:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)], %8540:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452)], %8542:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=454)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=440), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=441))] (%8526:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)]) -> (%8527:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=440)]) - graph.CallGraphOp @model.layers.11.self_attn (%8527:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=440)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8037:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %8038:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) -> (%8559:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464)], %8540:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452)], %8542:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=454)]) - linalg.CPU.AddOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464), )] (%8559:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464)], %8526:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)]) -> (%8560:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=465), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=466))] (%8560:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464)]) -> (%8561:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=465)]) - graph.CallGraphOp @model.layers.11.mlp (%8561:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=465)]) -> (%8566:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473), )] (%8566:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)], %8560:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464)]) -> (%8567:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)]) - cf.ReturnOp (%8567:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)], %8540:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452)], %8542:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=454)]) -> () + (%8840:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8230:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %8231:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42)]) -> (%8892:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8863:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464)], %8865:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=466)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=451, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=452, solved=0))] (%8840:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8841:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=451)]) + graph.CallGraphOp @model.layers.11.self_attn (%8841:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=451)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8230:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %8231:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42)]) -> (%8883:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=476)], %8863:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464)], %8865:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=466)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=476, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8840:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8883:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=476)]) -> (%8884:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=477, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=478, solved=0))] (%8884:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8885:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=477)]) + graph.CallGraphOp @model.layers.11.mlp (%8885:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=477)]) -> (%8891:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=485)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=485, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8884:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8891:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=485)]) -> (%8892:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + cf.ReturnOp (%8892:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8863:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464)], %8865:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=466)]) -> () } } graph.SubGraphOp @model.layers.11.self_attn [using_qnn:true, 
symbol:model.layers.11.self_attn] { - (%8527:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=440)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8037:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %8038:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) -> (%8559:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464)], %8540:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452)], %8542:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=454)]) { - linalg.CPU.LinearOp (%8527:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=440)]) -> (%8528:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=446)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=440), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=442))] (%8527:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=440)]) -> (%8529:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=440), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=445), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=444))] (%8527:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=440)]) -> (%8530:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=445)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=446), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=446), )] (%8528:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=446)]) -> (%8528:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=446)]) - linalg.CPU.TransposeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=446), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=446), )] (%8528:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=446)]) -> (%8531:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=446)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443), )] (%8529:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443)]) -> (%8529:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443), )] (%8529:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443)]) -> (%8532:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=445), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=445), )] (%8530:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=445)]) -> (%8530:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=445)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=445), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=445), )] (%8530:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=445)]) -> (%8533:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=445)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=446), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=447), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=448))] (%8531:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=446)]) -> (%8534:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=447)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=449), 
weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=450))] (%8532:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443)]) -> (%8535:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=449)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=447), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=447), )] (%8534:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=447)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8536:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=447)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=449), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=449), )] (%8535:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=449)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8537:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=449)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=449), outputs_0:QuantSpec(Raw(type: Float16), uuid=451), )] (%8537:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=449)]) -> (%8538:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=451)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=451), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452), )] (%8538:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=451)]) -> (%8539:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=452)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452), )] (%8539:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452)]) -> (%8540:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=445), outputs_0:QuantSpec(Raw(type: Float16), uuid=453), )] (%8533:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=445)]) -> (%8541:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=453)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=453), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=454), )] (%8541:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=453)]) -> (%8542:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=454)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=452), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14), )] (%8037:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %8540:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452)]) -> (%8543:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=454), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42), )] (%8038:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)], %8542:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=454)]) -> (%8544:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14), )] (%8543:tensor<[1, 8, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)]) -> (%8545:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42), )] (%8544:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) -> (%8546:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=447), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455), )] (%8536:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=447)], %8545:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)]) -> (%8547:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455)]) - linalg.CPU.MulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455), inputs_1:QuantSpec(Raw(type: Float32), uuid=456), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455), )] (%8547:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455)], %8548:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=456), constant:[0.088388346]]) -> (%8549:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457), )] (%8549:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455)]) -> (%8550:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457), inputs_1:QuantSpec(Raw(type: Int16), uuid=458), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457), )] (%8550:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=457)], %8551:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=458), constant:[-20]]) -> (%8552:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=459), outputs_0:QuantSpec(Raw(type: UInt8), uuid=460), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8553:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=459), constant:[0.515625]]) -> (%8554:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=460)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=460), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457), )] (%8554:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=460)], %8549:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455)], %8552:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457)]) -> (%8555:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457)]) - linalg.CPU.SoftmaxOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=461), )] (%8555:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457)]) -> (%8556:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=461)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=461), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462), )] (%8556:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=461)], %8546:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) -> (%8557:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462), )] (%8557:tensor<[1, 16, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462)]) -> (%8558:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462), )] (%8558:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462)]) -> (%8558:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=463))] (%8558:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462)]) -> (%8559:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=464)]) - cf.ReturnOp (%8559:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=464)], %8540:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=452)], %8542:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=454)]) -> () + (%8841:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=451)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8230:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %8231:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42)]) -> (%8883:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=476)], %8863:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464)], %8865:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=466)]) { + linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=451, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=454, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=453, solved=0))] (%8841:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=451)]) -> (%8842:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=454)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=451, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=456, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=455, solved=0))] (%8841:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=451)]) -> (%8843:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=456)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=451, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=458, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=457, solved=0))] (%8841:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=451)]) -> (%8844:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=458)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=454, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=454, solved=0), )] (%8842:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=454)]) -> (%8842:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=454)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=454, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=454, solved=0), )] (%8842:tensor<[1, 32, 16, 
128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=454)]) -> (%8845:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=454)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=456, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=456, solved=0), )] (%8843:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=456)]) -> (%8843:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=456)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=456, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=456, solved=0), )] (%8843:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=456)]) -> (%8846:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=456)]) + linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=458, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=458, solved=0), )] (%8844:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=458)]) -> (%8844:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=458)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=458, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=458, solved=0), )] (%8844:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=458)]) -> (%8847:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=458)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=454, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=460, solved=0))] (%8845:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=454)]) -> (%8848:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=456, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=462, solved=0))] (%8846:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=456)]) -> (%8849:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), )] (%8848:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)]) -> (%8848:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), )] (%8848:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)]) -> (%8848:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), )] (%8848:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)]) -> (%8850:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), )] (%8850:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)], %8848:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)]) -> 
(%8851:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), )] (%8851:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8852:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), )] (%8848:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)], %8268:tensor<[1, 1, 
32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8853:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), )] (%8853:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)], %8852:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)]) -> (%8854:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), )] (%8849:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)]) -> (%8849:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), )] (%8849:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)]) -> (%8849:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), )] (%8849:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)]) -> (%8855:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=461, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), )] (%8855:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)], %8849:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)]) -> (%8856:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), )] (%8856:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8857:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=461)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), )] (%8849:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8858:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), )] (%8858:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)], %8857:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=461)]) -> (%8859:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=463, solved=0), )] (%8859:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)]) -> (%8860:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=463)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=463, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464, solved=0), )] (%8860:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=463)]) -> (%8861:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464, solved=0), )] (%8861:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464)]) -> (%8863:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), 
uuid=464)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=458, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=465, solved=0), )] (%8847:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=458)]) -> (%8864:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=465)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=465, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=466, solved=0), )] (%8864:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=465)]) -> (%8865:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=466)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14, solved=0), )] (%8230:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %8863:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464)]) -> (%8867:tensor<[1, 8, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=466, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42, solved=0), )] (%8231:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42)], %8865:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=466)]) -> (%8868:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14, solved=0), )] (%8867:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)]) -> (%8869:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, 
quant_to_type: UInt8, scale_type: Float32), uuid=42, solved=0), )] (%8868:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42)]) -> (%8870:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=467, solved=0), )] (%8854:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)], %8869:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)]) -> (%8871:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=467)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=467, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=468, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=467, solved=0), )] (%8871:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=467)], %8872:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=468), constant:[0.088388346]]) -> (%8873:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=467)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=467, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=469, solved=0), )] (%8873:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=467)]) -> (%8874:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=469)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=469, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=470, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=469, solved=0), )] (%8874:tensor<[1, 16, 32, 1], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=469)], %8875:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=470), constant:[-20]]) -> (%8876:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=469)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=471, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=472, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8877:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=471), constant:[0]]) -> (%8878:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=472)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=472, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=467, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=469, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=469, solved=0), )] (%8878:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=472)], %8873:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=467)], %8876:tensor<[1, 16, 32, 1], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=469)]) -> (%8879:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=469)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=469, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=473, solved=0), )] (%8879:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=469)]) -> (%8880:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=473)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=473, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=474, solved=0), )] (%8880:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=473)], %8870:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, 
scale_type: Float32), uuid=42)]) -> (%8881:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=474)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=474, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=474, solved=0), )] (%8881:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=474)]) -> (%8882:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=474)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=474, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=474, solved=0), )] (%8882:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=474)]) -> (%8882:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=474)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=474, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=476, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=475, solved=0))] (%8882:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=474)]) -> (%8883:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=476)]) + cf.ReturnOp (%8883:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=476)], %8863:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464)], %8865:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=466)]) -> () } } graph.SubGraphOp @model.layers.11.mlp [using_qnn:true, symbol:model.layers.11.mlp] { - (%8561:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=465)]) -> (%8566:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=465), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=468), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=467))] (%8561:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=465)]) -> (%8562:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=468)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=468), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469), )] (%8562:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=468)]) -> (%8563:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=465), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=471), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=470))] (%8561:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=465)]) -> (%8564:tensor<[1, 32, 6144], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=471)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=471), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469), )] (%8563:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469)], %8564:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=471)]) -> (%8565:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=472))] (%8565:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469)]) -> (%8566:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)]) - cf.ReturnOp (%8566:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)]) -> () + (%8885:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=477)]) -> (%8891:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=485)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=477, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=480, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=479, solved=0))] (%8885:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=477)]) -> (%8886:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=480)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=477, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, 
scale_1_type: Float32), uuid=481, solved=0))] (%8885:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=477)]) -> (%8887:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=483, solved=0), )] (%8887:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482)]) -> (%8888:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=483)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=483, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482, solved=0), )] (%8887:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482)], %8888:tensor<[1, 32, 6144], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=483)]) -> (%8889:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=480, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482, solved=0), )] (%8889:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482)], %8886:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=480)]) -> (%8890:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=485, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: 
Float32), uuid=484, solved=0))] (%8890:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482)]) -> (%8891:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=485)]) + cf.ReturnOp (%8891:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=485)]) -> () } } graph.SubGraphOp @model.layers.12 [using_qnn:true, symbol:model.layers.12] { - (%8567:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8039:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %8040:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) -> (%8608:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)], %8581:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486)], %8583:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=488)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=475))] (%8567:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)]) -> (%8568:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474)]) - graph.CallGraphOp @model.layers.12.self_attn (%8568:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8039:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %8040:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) -> (%8600:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498)], %8581:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486)], %8583:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=488)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498), )] (%8600:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498)], %8567:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)]) -> (%8601:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=500))] 
(%8601:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498)]) -> (%8602:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499)]) - graph.CallGraphOp @model.layers.12.mlp (%8602:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499)]) -> (%8607:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507), )] (%8607:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)], %8601:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498)]) -> (%8608:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)]) - cf.ReturnOp (%8608:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)], %8581:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486)], %8583:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=488)]) -> () + (%8892:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8232:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %8233:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43)]) -> (%8944:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8915:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499)], %8917:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=501)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=486, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=487, solved=0))] (%8892:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8893:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=486)]) + graph.CallGraphOp @model.layers.12.self_attn (%8893:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=486)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8232:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %8233:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43)]) -> (%8935:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=511)], %8915:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499)], %8917:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=501)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=511, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8892:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8935:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=511)]) -> (%8936:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=512, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=513, solved=0))] (%8936:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8937:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=512)]) + graph.CallGraphOp @model.layers.12.mlp (%8937:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=512)]) -> (%8943:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=520)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=520, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8936:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8943:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=520)]) -> (%8944:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) 
+ cf.ReturnOp (%8944:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8915:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499)], %8917:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=501)]) -> () } } graph.SubGraphOp @model.layers.12.self_attn [using_qnn:true, symbol:model.layers.12.self_attn] { - (%8568:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8039:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %8040:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) -> (%8600:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498)], %8581:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486)], 
%8583:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=488)]) { - linalg.CPU.LinearOp (%8568:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474)]) -> (%8569:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=480)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=476))] (%8568:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474)]) -> (%8570:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=478))] (%8568:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474)]) -> (%8571:tensor<[1, 
32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=480), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=480), )] (%8569:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=480)]) -> (%8569:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=480)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=480), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=480), )] (%8569:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=480)]) -> (%8572:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=480)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477), )] (%8570:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477)]) -> (%8570:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477), )] (%8570:tensor<[1, 32, 8, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477)]) -> (%8573:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479), )] (%8571:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479)]) -> (%8571:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479), )] (%8571:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479)]) -> (%8574:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=480), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=481), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=482))] (%8572:tensor<[1, 16, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=480)]) -> (%8575:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=481)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=483), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=484))] (%8573:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477)]) -> (%8576:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=483)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=481), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=481), )] (%8575:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=481)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8577:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=481)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=483), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=483), )] (%8576:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=483)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8578:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=483)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=483), outputs_0:QuantSpec(Raw(type: Float16), uuid=485), )] (%8578:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=483)]) -> (%8579:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=485)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=485), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486), )] (%8579:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=485)]) -> (%8580:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486), )] (%8580:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486)]) -> (%8581:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479), outputs_0:QuantSpec(Raw(type: Float16), uuid=487), )] (%8574:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479)]) -> (%8582:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=487)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=487), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=488), )] (%8582:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=487)]) -> (%8583:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=488)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15), )] (%8039:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %8581:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486)]) -> (%8584:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=488), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43), )] (%8040:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)], %8583:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=488)]) -> 
(%8585:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15), )] (%8584:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)]) -> (%8586:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43), )] (%8585:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) -> (%8587:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=481), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489), )] (%8577:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=481)], %8586:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)]) -> (%8588:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489), inputs_1:QuantSpec(Raw(type: Float32), uuid=490), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489), )] (%8588:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489)], %8589:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=490), constant:[0.088388346]]) -> (%8590:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491), )] (%8590:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489)]) -> (%8591:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491)]) - linalg.CPU.AddOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491), inputs_1:QuantSpec(Raw(type: Int16), uuid=492), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491), )] (%8591:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491)], %8592:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=492), constant:[-20]]) -> (%8593:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=493), outputs_0:QuantSpec(Raw(type: UInt8), uuid=494), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8594:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=493), constant:[0.74609375]]) -> (%8595:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=494)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=494), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491), )] (%8595:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=494)], %8590:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=489)], %8593:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491)]) -> (%8596:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=495), )] (%8596:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491)]) -> (%8597:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=495)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=495), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=496), )] (%8597:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=495)], %8587:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) -> (%8598:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=496)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=496), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=496), )] (%8598:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=496)]) -> (%8599:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=496)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=496), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=496), )] (%8599:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=496)]) -> (%8599:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=496)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=496), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=497))] (%8599:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=496)]) -> (%8600:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498)]) - cf.ReturnOp (%8600:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=498)], %8581:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=486)], %8583:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=488)]) -> () + (%8893:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=486)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8232:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %8233:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43)]) -> (%8935:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=511)], 
%8915:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499)], %8917:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=501)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=486, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=489, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=488, solved=0))] (%8893:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=486)]) -> (%8894:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=489)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=486, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=491, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=490, solved=0))] (%8893:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=486)]) -> (%8895:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=491)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=486, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=493, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=492, solved=0))] (%8893:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=486)]) -> (%8896:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=493)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=489, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=489, solved=0), )] (%8894:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=489)]) -> (%8894:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=489)]) + linalg.CPU.TransposeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=489, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=489, solved=0), )] (%8894:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=489)]) -> (%8897:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=489)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=491, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=491, solved=0), )] (%8895:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=491)]) -> (%8895:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=491)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=491, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=491, solved=0), )] (%8895:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=491)]) -> (%8898:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=491)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=493, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=493, solved=0), )] (%8896:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=493)]) -> (%8896:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=493)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=493, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=493, solved=0), )] (%8896:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=493)]) -> (%8899:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=493)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=489, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=495, solved=0))] (%8897:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=489)]) -> (%8900:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=491, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=497, solved=0))] (%8898:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=491)]) -> (%8901:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), )] (%8900:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)]) -> (%8900:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=494, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), )] (%8900:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)]) -> (%8900:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), )] (%8900:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)]) -> (%8902:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), )] (%8902:tensor<[1, 16, 32, 64], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)], %8900:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)]) -> (%8903:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), )] (%8903:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8904:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), )] (%8900:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8905:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), )] (%8905:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)], %8904:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)]) -> (%8906:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), )] (%8901:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) -> (%8901:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), )] (%8901:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) -> (%8901:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), )] (%8901:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) 
-> (%8907:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), )] (%8907:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)], %8901:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) -> (%8908:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), )] (%8908:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=496)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8909:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), )] (%8901:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8910:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, 
solved=0), )] (%8910:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)], %8909:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) -> (%8911:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=498, solved=0), )] (%8911:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) -> (%8912:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=498)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=498, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499, solved=0), )] (%8912:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=498)]) -> (%8913:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, 
scale_type: Float32), uuid=499, solved=0), )] (%8913:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499)]) -> (%8915:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=493, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=500, solved=0), )] (%8899:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=493)]) -> (%8916:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=500)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=500, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=501, solved=0), )] (%8916:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=500)]) -> (%8917:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=501)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15, solved=0), )] (%8232:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %8915:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499)]) -> (%8919:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=501, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43, solved=0), )] (%8233:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43)], %8917:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=501)]) -> (%8920:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15, solved=0), )] (%8919:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)]) -> (%8921:tensor<[1, 16, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43, solved=0), )] (%8920:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43)]) -> (%8922:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=502, solved=0), )] (%8906:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)], %8921:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)]) -> (%8923:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=502)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=502, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=503, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=502, solved=0), )] (%8923:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=502)], %8924:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=503), constant:[0.088388346]]) -> (%8925:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=502)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=502, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=504, solved=0), )] (%8925:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=502)]) -> (%8926:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=504)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=504, 
solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=505, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=504, solved=0), )] (%8926:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=504)], %8927:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=505), constant:[-20]]) -> (%8928:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=504)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=506, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=507, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8929:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=506), constant:[0]]) -> (%8930:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=507)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=507, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=502, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=504, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=504, solved=0), )] (%8930:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=507)], %8925:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=502)], %8928:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=504)]) -> (%8931:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=504)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=504, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=508, solved=0), )] (%8931:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=504)]) -> (%8932:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=508)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=508, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=509, solved=0), )] (%8932:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=508)], %8922:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43)]) -> (%8933:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=509)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=509, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=509, solved=0), )] (%8933:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=509)]) -> (%8934:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=509)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=509, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=509, solved=0), )] (%8934:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=509)]) -> (%8934:tensor<[1, 32, 2048], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=509)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=509, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=511, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=510, solved=0))] (%8934:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=509)]) -> (%8935:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=511)]) + cf.ReturnOp (%8935:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=511)], %8915:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499)], %8917:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=501)]) -> () } } graph.SubGraphOp @model.layers.12.mlp [using_qnn:true, symbol:model.layers.12.mlp] { - (%8602:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499)]) -> (%8607:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=502), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=501))] (%8602:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499)]) -> (%8603:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=502)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=502), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=503), )] (%8603:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=502)]) -> (%8604:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=503)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=505), weight_weight:QuantSpec(LPBQ(quant_min: -8, 
quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=504))] (%8602:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499)]) -> (%8605:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=505)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=503), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=505), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=503), )] (%8604:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=503)], %8605:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=505)]) -> (%8606:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=503)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=503), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=506))] (%8606:tensor<[1, 32, 6144], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=503)]) -> (%8607:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)]) - cf.ReturnOp (%8607:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)]) -> () + (%8937:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=512)]) -> (%8943:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=520)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=512, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=515, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=514, solved=0))] (%8937:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=512)]) -> (%8938:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=515)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=512, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=516, solved=0))] (%8937:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=512)]) -> (%8939:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=518, solved=0), )] (%8939:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517)]) -> (%8940:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=518)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=518, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517, solved=0), )] (%8939:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517)], %8940:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=518)]) -> (%8941:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=515, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517, solved=0), )] (%8941:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517)], %8938:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=515)]) -> (%8942:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=520, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=519, solved=0))] (%8942:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517)]) -> (%8943:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=520)]) + cf.ReturnOp (%8943:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=520)]) -> () } } graph.SubGraphOp @model.layers.13 [using_qnn:true, symbol:model.layers.13] { - (%8608:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8041:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %8042:tensor<[1, 8, 992, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) -> (%8649:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)], %8622:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520)], %8624:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=522)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=508), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=509))] (%8608:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)]) -> (%8609:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=508)]) - graph.CallGraphOp @model.layers.13.self_attn (%8609:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=508)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8041:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %8042:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) -> (%8641:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532)], %8622:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520)], %8624:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=522)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532), )] (%8641:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532)], %8608:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)]) -> (%8642:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532)]) - linalg.CPU.RMSNormOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=533), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=534))] (%8642:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532)]) -> (%8643:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=533)]) - graph.CallGraphOp @model.layers.13.mlp (%8643:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=533)]) -> (%8648:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541), )] (%8648:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)], %8642:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532)]) -> (%8649:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)]) - cf.ReturnOp (%8649:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)], %8622:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520)], %8624:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=522)]) -> () + (%8944:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8234:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %8235:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44)]) -> (%8996:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8967:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, 
quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534)], %8969:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=536)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=521, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=522, solved=0))] (%8944:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8945:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=521)]) + graph.CallGraphOp @model.layers.13.self_attn (%8945:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=521)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8234:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=16)], %8235:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44)]) -> (%8987:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=546)], %8967:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534)], %8969:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=536)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=546, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8944:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8987:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=546)]) -> (%8988:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=547, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=548, solved=0))] (%8988:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8989:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=547)]) + graph.CallGraphOp @model.layers.13.mlp (%8989:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=547)]) -> (%8995:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=555)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=555, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8988:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], 
%8995:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=555)]) -> (%8996:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + cf.ReturnOp (%8996:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8967:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534)], %8969:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=536)]) -> () } } graph.SubGraphOp @model.layers.13.self_attn [using_qnn:true, symbol:model.layers.13.self_attn] { - (%8609:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=508)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8041:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %8042:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=44)]) -> (%8641:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532)], %8622:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520)], %8624:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=522)]) { - linalg.CPU.LinearOp (%8609:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=508)]) -> (%8610:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=514)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=508), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=511), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=510))] (%8609:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=508)]) -> (%8611:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=511)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=508), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=513), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=512))] (%8609:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=508)]) -> (%8612:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=513)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=514), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=514), )] (%8610:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=514)]) -> (%8610:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=514)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=514), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=514), )] (%8610:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=514)]) -> (%8613:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=514)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=511), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=511), )] (%8611:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=511)]) -> (%8611:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=511)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=511), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=511), )] (%8611:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=511)]) -> (%8614:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=511)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=513), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=513), )] (%8612:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=513)]) -> (%8612:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=513)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=513), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=513), )] (%8612:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=513)]) -> (%8615:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=513)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=514), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=515), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=516))] (%8613:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=514)]) -> (%8616:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=515)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=511), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=518))] (%8614:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=511)]) -> (%8617:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=515), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=515), )] (%8616:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=515)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8618:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=515)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517), )] (%8617:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8619:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=517)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517), outputs_0:QuantSpec(Raw(type: Float16), uuid=519), )] (%8619:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517)]) -> (%8620:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=519)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=519), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520), )] (%8620:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=519)]) -> (%8621:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520), )] (%8621:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520)]) -> (%8622:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=513), outputs_0:QuantSpec(Raw(type: Float16), uuid=521), )] (%8615:tensor<[1, 8, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=513)]) -> (%8623:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=521)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=521), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=522), )] (%8623:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=521)]) -> (%8624:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=522)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16), )] (%8041:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %8622:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520)]) -> (%8625:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=522), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44), )] (%8042:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)], %8624:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=522)]) -> (%8626:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16), )] (%8625:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)]) -> (%8627:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44), )] (%8626:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) -> (%8628:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) - linalg.CPU.MatMulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=515), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=523), )] (%8618:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=515)], %8627:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)]) -> (%8629:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=523)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=523), inputs_1:QuantSpec(Raw(type: Float32), uuid=524), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=523), )] (%8629:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=523)], %8630:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=524), constant:[0.088388346]]) -> (%8631:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=523)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=523), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=525), )] (%8631:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=523)]) -> (%8632:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=525)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=525), inputs_1:QuantSpec(Raw(type: Int16), uuid=526), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=525), )] (%8632:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=525)], %8633:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=526), constant:[-20]]) -> (%8634:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=525)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=527), outputs_0:QuantSpec(Raw(type: UInt8), uuid=528), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8635:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=527), constant:[-0.78515625]]) -> (%8636:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=528)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=528), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=523), 
inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=525), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=525), )] (%8636:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=528)], %8631:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=523)], %8634:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=525)]) -> (%8637:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=525)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=525), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529), )] (%8637:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=525)]) -> (%8638:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=530), )] 
(%8638:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529)], %8628:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) -> (%8639:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=530)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=530), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=530), )] (%8639:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=530)]) -> (%8640:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=530)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=530), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=530), )] (%8640:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=530)]) -> (%8640:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=530)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=530), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=531))] (%8640:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=530)]) -> (%8641:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532)]) - cf.ReturnOp (%8641:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=532)], %8622:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=520)], %8624:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=522)]) -> () + (%8945:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=521)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8234:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %8235:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44)]) -> (%8987:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=546)], %8967:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534)], %8969:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=536)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=521, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=524, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=523, solved=0))] (%8945:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=521)]) -> (%8946:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=524)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=521, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=526, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=525, solved=0))] (%8945:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=521)]) -> (%8947:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=526)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=521, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=528, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=527, solved=0))] (%8945:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=521)]) -> (%8948:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=528)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=524, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=524, solved=0), )] (%8946:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=524)]) -> (%8946:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=524)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=524, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=524, solved=0), )] (%8946:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=524)]) -> (%8949:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=524)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=526, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=526, solved=0), )] (%8947:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=526)]) -> (%8947:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=526)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=526, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=526, solved=0), )] (%8947:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=526)]) -> (%8950:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=526)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=528, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=528, solved=0), )] (%8948:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=528)]) -> (%8948:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=528)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=528, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=528, solved=0), )] (%8948:tensor<[1, 32, 8, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=528)]) -> (%8951:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=528)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=524, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=530, solved=0))] (%8949:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=524)]) -> (%8952:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=526, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=532, solved=0))] (%8950:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=526)]) -> (%8953:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), )] (%8952:tensor<[1, 16, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)]) -> (%8952:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), )] (%8952:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)]) -> (%8952:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), )] (%8952:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)]) -> (%8954:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)]) + linalg.CPU.ConcatOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), )] (%8954:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)], %8952:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)]) -> (%8955:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), )] (%8955:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> 
(%8956:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), )] (%8952:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8957:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), )] (%8957:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=529)], %8956:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)]) -> (%8958:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), )] (%8953:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) -> (%8953:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), )] (%8953:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) -> (%8953:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) + linalg.CPU.NegOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), )] (%8953:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) -> (%8959:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), )] (%8959:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)], %8953:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) -> (%8960:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), )] (%8960:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8961:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), )] (%8953:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8962:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=531)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), )] (%8962:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)], %8961:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) -> (%8963:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=533, solved=0), )] (%8963:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) -> (%8964:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=533)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=533, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534, solved=0), )] 
(%8964:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=533)]) -> (%8965:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534, solved=0), )] (%8965:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534)]) -> (%8967:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=528, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=535, solved=0), )] (%8951:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=528)]) -> (%8968:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=535)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=535, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=536, solved=0), )] (%8968:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=535)]) -> (%8969:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, 
quant_to_type: UInt8, scale_type: Float32), uuid=536)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16, solved=0), )] (%8234:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %8967:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534)]) -> (%8971:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=536, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44, solved=0), )] (%8235:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44)], %8969:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=536)]) -> (%8972:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44)]) + 
linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16, solved=0), )] (%8971:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)]) -> (%8973:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44, solved=0), )] (%8972:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44)]) -> (%8974:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=537, solved=0), )] (%8958:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=529)], %8973:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)]) -> (%8975:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=537)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=537, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=538, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=537, solved=0), )] (%8975:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=537)], %8976:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=538), constant:[0.088388346]]) -> (%8977:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=537)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=537, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=539, solved=0), )] (%8977:tensor<[1, 16, 32, 1024], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=537)]) -> (%8978:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=539)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=539, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=540, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=539, solved=0), )] (%8978:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=539)], %8979:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=540), constant:[-20]]) -> (%8980:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=539)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=541, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=542, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8981:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=541), constant:[0]]) -> (%8982:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), 
uuid=542)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=542, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=537, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=539, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=539, solved=0), )] (%8982:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=542)], %8977:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=537)], %8980:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=539)]) -> (%8983:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=539)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=539, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=543, solved=0), )] (%8983:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=539)]) -> (%8984:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=543)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=543, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=544, solved=0), )] (%8984:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=543)], %8974:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44)]) -> (%8985:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=544)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=544, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=544, solved=0), )] (%8985:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=544)]) -> (%8986:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=544)]) + linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=544, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=544, solved=0), )] (%8986:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=544)]) -> (%8986:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=544)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=544, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=546, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=545, solved=0))] (%8986:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=544)]) -> (%8987:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=546)]) + cf.ReturnOp (%8987:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=546)], %8967:tensor<[1, 8, 128, 32], UInt8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534)], %8969:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=536)]) -> () } } graph.SubGraphOp @model.layers.13.mlp [using_qnn:true, symbol:model.layers.13.mlp] { - (%8643:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=533)]) -> (%8648:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=533), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=535))] (%8643:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=533)]) -> (%8644:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=537), )] (%8644:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536)]) -> (%8645:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=537)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=533), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=539), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=538))] (%8643:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=533)]) -> (%8646:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=539)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=537), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=539), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=537), )] (%8645:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=537)], %8646:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=539)]) -> (%8647:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=537)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=537), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=540))] (%8647:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=537)]) -> (%8648:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)]) - cf.ReturnOp (%8648:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)]) -> () + (%8989:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=547)]) -> (%8995:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=555)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=547, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=550, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: 
UInt4, scale_1_type: Float32), uuid=549, solved=0))] (%8989:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=547)]) -> (%8990:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=550)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=547, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=552, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=551, solved=0))] (%8989:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=547)]) -> (%8991:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=552)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=552, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=553, solved=0), )] (%8991:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=552)]) -> (%8992:tensor<[1, 
32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=553)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=552, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=553, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=552, solved=0), )] (%8991:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=552)], %8992:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=553)]) -> (%8993:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=552)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=552, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=550, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=552, solved=0), )] (%8993:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=552)], %8990:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=550)]) -> (%8994:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=552)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=552, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=555, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=554, solved=0))] (%8994:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=552)]) -> (%8995:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=555)]) + cf.ReturnOp (%8995:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=555)]) -> () } } graph.SubGraphOp @model.layers.14 [using_qnn:true, symbol:model.layers.14] { - (%8649:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8043:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %8044:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) -> (%8690:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)], %8663:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %8665:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=542), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=543))] (%8649:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)]) -> (%8650:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=542)]) 
- graph.CallGraphOp @model.layers.14.self_attn (%8650:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=542)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8043:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %8044:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) -> (%8682:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)], %8663:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %8665:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566), )] (%8682:tensor<[1, 32, 
2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)], %8649:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=541)]) -> (%8683:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=567), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=568))] (%8683:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)]) -> (%8684:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=567)]) - graph.CallGraphOp @model.layers.14.mlp (%8684:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=567)]) -> (%8689:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=566), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575), )] (%8689:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)], %8683:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)]) -> (%8690:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)]) - cf.ReturnOp (%8690:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)], %8663:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %8665:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)]) -> () + (%8996:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8236:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=17)], %8237:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45)]) -> (%9048:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9019:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569)], %9021:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=571)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=556, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=557, solved=0))] (%8996:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8997:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=556)]) + graph.CallGraphOp @model.layers.14.self_attn (%8997:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=556)], %8267:tensor<[1, 1, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8236:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %8237:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45)]) -> (%9039:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=581)], %9019:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569)], %9021:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=571)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=581, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8996:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=60)], %9039:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=581)]) -> (%9040:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=582, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=583, solved=0))] (%9040:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9041:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=582)]) + graph.CallGraphOp @model.layers.14.mlp (%9041:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=582)]) -> (%9047:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=590)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=590, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9040:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9047:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=590)]) -> (%9048:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + cf.ReturnOp (%9048:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9019:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569)], %9021:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=571)]) -> () } } graph.SubGraphOp @model.layers.14.self_attn [using_qnn:true, symbol:model.layers.14.self_attn] { - (%8650:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=542)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8043:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %8044:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) -> (%8682:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)], %8663:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %8665:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)]) { - linalg.CPU.LinearOp (%8650:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=542)]) -> (%8651:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=548)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=542), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=544))] (%8650:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=542)]) -> (%8652:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=542), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=546))] (%8650:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=542)]) -> (%8653:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=548), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=548), )] (%8651:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=548)]) -> (%8651:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=548)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=548), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=548), )] (%8651:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=548)]) -> (%8654:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=548)]) - linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545), )] (%8652:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545)]) -> (%8652:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545), )] (%8652:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545)]) -> (%8655:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547), )] (%8653:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547)]) -> (%8653:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547)]) - 
linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547), )] (%8653:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547)]) -> (%8656:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=548), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=549), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=550))] (%8654:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=548)]) -> (%8657:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=549)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=551), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=552))] (%8655:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545)]) -> (%8658:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=551)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=549), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=549), )] (%8657:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=549)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8659:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=549)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=551), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=551), )] (%8658:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=551)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8660:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=551)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=551), outputs_0:QuantSpec(Raw(type: Float16), uuid=553), )] (%8660:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=551)]) -> (%8661:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=553)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=553), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554), )] (%8661:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=553)]) -> (%8662:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554), )] (%8662:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)]) -> (%8663:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547), outputs_0:QuantSpec(Raw(type: Float16), uuid=555), )] (%8656:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547)]) -> (%8664:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=555)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=555), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556), )] (%8664:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=555)]) -> (%8665:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17), )] (%8043:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %8663:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)]) -> (%8666:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45), )] (%8044:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)], %8665:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)]) -> (%8667:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17), )] (%8666:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)]) -> (%8668:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45), )] (%8667:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) -> (%8669:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=549), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557), )] (%8659:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=549)], %8668:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)]) -> (%8670:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557), inputs_1:QuantSpec(Raw(type: Float32), uuid=558), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557), )] (%8670:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=557)], %8671:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=558), constant:[0.088388346]]) -> (%8672:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559), )] (%8672:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557)]) -> (%8673:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559), inputs_1:QuantSpec(Raw(type: Int16), uuid=560), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559), )] (%8673:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559)], %8674:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=560), constant:[-20]]) -> (%8675:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=561), 
outputs_0:QuantSpec(Raw(type: UInt8), uuid=562), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8676:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=561), constant:[-0.46289062]]) -> (%8677:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=562)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=562), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559), )] (%8677:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=562)], %8672:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557)], %8675:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559)]) -> (%8678:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=563), )] (%8678:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559)]) -> (%8679:tensor<[1, 16, 32, 
1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=563)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=563), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564), )] (%8679:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=563)], %8669:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) -> (%8680:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564), )] (%8680:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564)]) -> (%8681:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564), )] (%8681:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564)]) -> (%8681:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=565))] (%8681:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564)]) -> (%8682:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)]) - cf.ReturnOp (%8682:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)], %8663:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %8665:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)]) -> () + (%8997:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=556)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8236:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %8237:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45)]) -> (%9039:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=581)], %9019:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569)], %9021:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=571)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=556, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=559, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=558, solved=0))] (%8997:tensor<[1, 32, 
2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=556)]) -> (%8998:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=559)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=556, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=561, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=560, solved=0))] (%8997:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=556)]) -> (%8999:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=561)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=556, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=563, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=562, solved=0))] (%8997:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=556)]) -> (%9000:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=563)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=559, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=559, solved=0), )] (%8998:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=559)]) -> (%8998:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=559)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=559, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=559, solved=0), )] (%8998:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=559)]) -> (%9001:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=559)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=561, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=561, solved=0), )] (%8999:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=561)]) -> (%8999:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=561)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=561, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=561, solved=0), )] (%8999:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=561)]) -> (%9002:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=561)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=563, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=563, solved=0), )] (%9000:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=563)]) -> (%9000:tensor<[1, 32, 8, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=563)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=563, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=563, solved=0), )] (%9000:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=563)]) -> (%9003:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=563)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=559, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=565, solved=0))] (%9001:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=559)]) -> (%9004:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=561, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=567, solved=0))] (%9002:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=561)]) -> (%9005:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> 
(%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), )] (%9004:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)]) -> (%9004:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), )] (%9004:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)]) -> (%9004:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), )] (%9004:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)]) -> (%9006:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), )] (%9006:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)], %9004:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)]) -> (%9007:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), )] (%9007:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9008:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), )] (%9004:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9009:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=564, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), )] (%9009:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)], %9008:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)]) -> (%9010:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), )] (%9005:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) -> (%9005:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), )] (%9005:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) -> (%9005:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), )] (%9005:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) -> (%9011:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), )] (%9011:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=566)], %9005:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) -> (%9012:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), )] (%9012:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9013:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), )] 
(%9005:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9014:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), )] (%9014:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)], %9013:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) -> (%9015:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=568, solved=0), )] 
(%9015:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) -> (%9016:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=568)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=568, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569, solved=0), )] (%9016:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=568)]) -> (%9017:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569, solved=0), )] (%9017:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569)]) -> (%9019:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=563, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=570, solved=0), )] (%9003:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=563)]) -> (%9020:tensor<[1, 8, 32, 
128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=570)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=570, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=571, solved=0), )] (%9020:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=570)]) -> (%9021:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=571)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17, solved=0), )] (%8236:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %9019:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569)]) -> (%9023:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=571, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45, solved=0), )] 
(%8237:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45)], %9021:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=571)]) -> (%9024:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17, solved=0), )] (%9023:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)]) -> (%9025:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45, solved=0), )] (%9024:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45)]) -> (%9026:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=572, solved=0), )] (%9010:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)], %9025:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)]) -> (%9027:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=572)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=572, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=573, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=572, solved=0), )] (%9027:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=572)], %9028:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=573), constant:[0.088388346]]) -> (%9029:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=572)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=572, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=574, solved=0), )] (%9029:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=572)]) -> (%9030:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=574)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=574, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=575, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=574, solved=0), )] (%9030:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=574)], %9031:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=575), constant:[-20]]) -> (%9032:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=574)]) + 
linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=576, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=577, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9033:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=576), constant:[0]]) -> (%9034:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=577)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=577, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=572, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=574, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=574, solved=0), )] (%9034:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=577)], %9029:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=572)], %9032:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=574)]) -> (%9035:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=574)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=574, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=578, solved=0), )] (%9035:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=574)]) -> (%9036:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=578)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=578, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=579, solved=0), )] (%9036:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=578)], %9026:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45)]) -> (%9037:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=579)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=579, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=579, solved=0), )] (%9037:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=579)]) -> (%9038:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=579)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=579, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=579, solved=0), )] (%9038:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=579)]) -> (%9038:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=579)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=579, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=581, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=580, solved=0))] (%9038:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=579)]) -> (%9039:tensor<[1, 
32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=581)]) + cf.ReturnOp (%9039:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=581)], %9019:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569)], %9021:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=571)]) -> () } } graph.SubGraphOp @model.layers.14.mlp [using_qnn:true, symbol:model.layers.14.mlp] { - (%8684:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=567)]) -> (%8689:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=567), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=570), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=569))] (%8684:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=567)]) -> (%8685:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=570)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=570), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=571), )] (%8685:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=570)]) -> (%8686:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=571)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=567), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=573), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=572))] (%8684:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=567)]) -> (%8687:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=573)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=571), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=573), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=571), )] (%8686:tensor<[1, 32, 6144], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=571)], %8687:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=573)]) -> (%8688:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=571)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=571), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=574))] (%8688:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=571)]) -> (%8689:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)]) - cf.ReturnOp (%8689:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)]) -> () + (%9041:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=582)]) -> (%9047:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=590)]) { + linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=582, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=585, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=584, solved=0))] (%9041:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=582)]) -> (%9042:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=585)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=582, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=586, solved=0))] (%9041:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=582)]) -> (%9043:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=588, solved=0), )] (%9043:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587)]) -> (%9044:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=588)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=588, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587, solved=0), )] (%9043:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587)], %9044:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=588)]) -> (%9045:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587, solved=0), 
inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=585, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587, solved=0), )] (%9045:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587)], %9042:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=585)]) -> (%9046:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=590, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=589, solved=0))] (%9046:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587)]) -> (%9047:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=590)]) + cf.ReturnOp (%9047:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=590)]) -> () } } graph.SubGraphOp @model.layers.15 [using_qnn:true, symbol:model.layers.15] { - (%8690:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8045:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %8046:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) -> (%8731:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)], %8704:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588)], %8706:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=590)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=576), 
weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=577))] (%8690:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)]) -> (%8691:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=576)]) - graph.CallGraphOp @model.layers.15.self_attn (%8691:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=576)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8045:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %8046:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) -> (%8723:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600)], %8704:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588)], %8706:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=590)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600), )] (%8723:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600)], %8690:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)]) -> (%8724:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=601), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=602))] (%8724:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600)]) -> (%8725:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=601)]) - graph.CallGraphOp @model.layers.15.mlp (%8725:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=601)]) -> (%8730:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609), )] (%8730:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)], %8724:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600)]) -> (%8731:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)]) - cf.ReturnOp (%8731:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)], %8704:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588)], %8706:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=590)]) -> () + (%9048:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8238:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %8239:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46)]) -> (%9100:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9071:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604)], %9073:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=606)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=591, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=592, solved=0))] (%9048:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=60)]) -> (%9049:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=591)]) + graph.CallGraphOp @model.layers.15.self_attn (%9049:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=591)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8238:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %8239:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46)]) -> (%9091:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=616)], %9071:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604)], %9073:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=606)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=616, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9048:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9091:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=616)]) -> (%9092:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=617, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=618, solved=0))] (%9092:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9093:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=617)]) + graph.CallGraphOp @model.layers.15.mlp (%9093:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=617)]) -> (%9099:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=625)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=625, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9092:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9099:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=625)]) -> (%9100:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + cf.ReturnOp (%9100:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9071:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604)], %9073:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: 
UInt8, scale_type: Float32), uuid=606)]) -> () } } graph.SubGraphOp @model.layers.15.self_attn [using_qnn:true, symbol:model.layers.15.self_attn] { - (%8691:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=576)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8045:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %8046:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) -> (%8723:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600)], %8704:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588)], %8706:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=590)]) { - linalg.CPU.LinearOp (%8691:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=576)]) -> (%8692:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=582)]) - linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=576), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=578))] (%8691:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=576)]) -> (%8693:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=576), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=580))] (%8691:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=576)]) -> (%8694:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=582), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=582), )] (%8692:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=582)]) -> (%8692:tensor<[1, 32, 16, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=582)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=582), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=582), )] (%8692:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=582)]) -> (%8695:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=582)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579), )] (%8693:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579)]) -> (%8693:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579), )] (%8693:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579)]) -> (%8696:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581), )] (%8694:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581)]) -> (%8694:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581), )] (%8694:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581)]) -> (%8697:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=582), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=583), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=584))] (%8695:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=582)]) -> (%8698:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=583)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=585), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=586))] (%8696:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579)]) -> (%8699:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=585)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=583), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=583), )] (%8698:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=583)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8700:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=583)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=585), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=585), )] (%8699:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=585)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8701:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=585)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=585), outputs_0:QuantSpec(Raw(type: Float16), uuid=587), )] (%8701:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=585)]) -> (%8702:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=587)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=587), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588), )] (%8702:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=587)]) -> (%8703:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588), )] (%8703:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588)]) -> (%8704:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581), outputs_0:QuantSpec(Raw(type: Float16), uuid=589), )] (%8697:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581)]) -> (%8705:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=589)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=589), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=590), )] (%8705:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=589)]) -> (%8706:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=590)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18), )] (%8045:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %8704:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588)]) -> (%8707:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=590), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46), )] (%8046:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)], %8706:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=590)]) -> (%8708:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18), )] 
(%8707:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)]) -> (%8709:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46), )] (%8708:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) -> (%8710:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=583), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=591), )] (%8700:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=583)], %8709:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)]) -> (%8711:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=591)]) - linalg.CPU.MulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=591), inputs_1:QuantSpec(Raw(type: Float32), uuid=592), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=591), )] (%8711:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=591)], %8712:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=592), constant:[0.088388346]]) -> (%8713:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=591)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=591), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593), )] (%8713:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=591)]) -> (%8714:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593), inputs_1:QuantSpec(Raw(type: Int16), uuid=594), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593), )] (%8714:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=593)], %8715:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=594), constant:[-20]]) -> (%8716:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=595), outputs_0:QuantSpec(Raw(type: UInt8), uuid=596), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8717:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=595), constant:[0.953125]]) -> (%8718:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=596)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=596), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=591), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593), )] (%8718:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=596)], %8713:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=591)], %8716:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593)]) -> (%8719:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593)]) - linalg.CPU.SoftmaxOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597), )] (%8719:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593)]) -> (%8720:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=598), )] (%8720:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597)], %8710:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) -> (%8721:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=598)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=598), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=598), )] (%8721:tensor<[1, 16, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=598)]) -> (%8722:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=598)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=598), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=598), )] (%8722:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=598)]) -> (%8722:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=598)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=598), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=599))] (%8722:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=598)]) -> (%8723:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600)]) - cf.ReturnOp (%8723:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=600)], %8704:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=588)], %8706:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=590)]) -> () + (%9049:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=591)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8238:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %8239:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46)]) -> (%9091:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=616)], %9071:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604)], %9073:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=606)]) { + linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=591, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=594, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=593, solved=0))] (%9049:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=591)]) -> (%9050:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=594)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=591, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=596, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=595, solved=0))] (%9049:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=591)]) -> (%9051:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=596)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=591, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=598, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=597, solved=0))] (%9049:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=591)]) -> (%9052:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=598)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=594, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=594, solved=0), )] (%9050:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=594)]) -> (%9050:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=594)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=594, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=594, solved=0), )] (%9050:tensor<[1, 32, 16, 
128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=594)]) -> (%9053:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=594)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=596, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=596, solved=0), )] (%9051:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=596)]) -> (%9051:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=596)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=596, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=596, solved=0), )] (%9051:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=596)]) -> (%9054:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=596)]) + linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=598, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=598, solved=0), )] (%9052:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=598)]) -> (%9052:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=598)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=598, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=598, solved=0), )] (%9052:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=598)]) -> (%9055:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=598)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=594, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=600, solved=0))] (%9053:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=594)]) -> (%9056:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=596, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=602, solved=0))] (%9054:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=596)]) -> (%9057:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), )] (%9056:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)]) -> (%9056:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), )] (%9056:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)]) -> (%9056:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), )] (%9056:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)]) -> (%9058:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), )] (%9058:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)], %9056:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)]) -> 
(%9059:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), )] (%9059:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9060:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), )] (%9056:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)], %8268:tensor<[1, 1, 
32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9061:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), )] (%9061:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)], %9060:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)]) -> (%9062:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), )] (%9057:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)]) -> (%9057:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), )] (%9057:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)]) -> (%9057:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), )] (%9057:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)]) -> (%9063:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=601, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), )] (%9063:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)], %9057:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)]) -> (%9064:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), )] (%9064:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9065:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=601)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), )] (%9057:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9066:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), )] (%9066:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)], %9065:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=601)]) -> (%9067:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=603, solved=0), )] (%9067:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)]) -> (%9068:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=603)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=603, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604, solved=0), )] (%9068:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=603)]) -> (%9069:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604, solved=0), )] (%9069:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604)]) -> (%9071:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), 
uuid=604)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=598, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=605, solved=0), )] (%9055:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=598)]) -> (%9072:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=605)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=605, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=606, solved=0), )] (%9072:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=605)]) -> (%9073:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=606)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18, solved=0), )] (%8238:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %9071:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604)]) -> (%9075:tensor<[1, 8, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=606, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46, solved=0), )] (%8239:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46)], %9073:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=606)]) -> (%9076:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18, solved=0), )] (%9075:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)]) -> (%9077:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, 
quant_to_type: UInt8, scale_type: Float32), uuid=46, solved=0), )] (%9076:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46)]) -> (%9078:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=607, solved=0), )] (%9062:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)], %9077:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)]) -> (%9079:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=607)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=607, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=608, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=607, solved=0), )] (%9079:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=607)], %9080:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=608), constant:[0.088388346]]) -> (%9081:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=607)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=607, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=609, solved=0), )] (%9081:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=607)]) -> (%9082:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=609)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=609, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=610, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=609, solved=0), )] (%9082:tensor<[1, 16, 32, 1], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=609)], %9083:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=610), constant:[-20]]) -> (%9084:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=609)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=611, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=612, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9085:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=611), constant:[0]]) -> (%9086:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=612)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=612, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=607, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=609, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=609, solved=0), )] (%9086:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=612)], %9081:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=607)], %9084:tensor<[1, 16, 32, 1], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=609)]) -> (%9087:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=609)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=609, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=613, solved=0), )] (%9087:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=609)]) -> (%9088:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=613)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=613, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=614, solved=0), )] (%9088:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=613)], %9078:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, 
scale_type: Float32), uuid=46)]) -> (%9089:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=614)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=614, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=614, solved=0), )] (%9089:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=614)]) -> (%9090:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=614)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=614, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=614, solved=0), )] (%9090:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=614)]) -> (%9090:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=614)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=614, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=616, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=615, solved=0))] (%9090:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=614)]) -> (%9091:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=616)]) + cf.ReturnOp (%9091:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=616)], %9071:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604)], %9073:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=606)]) -> () } } graph.SubGraphOp @model.layers.15.mlp [using_qnn:true, symbol:model.layers.15.mlp] { - (%8725:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=601)]) -> (%8730:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=601), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=603))] (%8725:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=601)]) -> (%8726:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=605), )] (%8726:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604)]) -> (%8727:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=605)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=601), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=607), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=606))] (%8725:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=601)]) -> (%8728:tensor<[1, 32, 6144], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=607)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=605), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=607), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=605), )] (%8727:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=605)], %8728:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=607)]) -> (%8729:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=605)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=605), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=608))] (%8729:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=605)]) -> (%8730:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)]) - cf.ReturnOp (%8730:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)]) -> () + (%9093:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=617)]) -> (%9099:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=625)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=617, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=620, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=619, solved=0))] (%9093:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=617)]) -> (%9094:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=620)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=617, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=622, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, 
scale_1_type: Float32), uuid=621, solved=0))] (%9093:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=617)]) -> (%9095:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=622)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=622, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=623, solved=0), )] (%9095:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=622)]) -> (%9096:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=623)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=622, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=623, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=622, solved=0), )] (%9095:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=622)], %9096:tensor<[1, 32, 6144], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=623)]) -> (%9097:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=622)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=622, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=620, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=622, solved=0), )] (%9097:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=622)], %9094:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=620)]) -> (%9098:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=622)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=622, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=625, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: 
Float32), uuid=624, solved=0))] (%9098:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=622)]) -> (%9099:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=625)]) + cf.ReturnOp (%9099:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=625)]) -> () } } graph.SubGraphOp @model.layers.16 [using_qnn:true, symbol:model.layers.16] { - (%8731:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8047:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %8048:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) -> (%8772:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)], %8745:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622)], %8747:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=624)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=610), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=611))] (%8731:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)]) -> (%8732:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=610)]) - graph.CallGraphOp @model.layers.16.self_attn (%8732:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=610)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8047:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %8048:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) -> (%8764:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)], %8745:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622)], %8747:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=624)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634), )] (%8764:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)], %8731:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)]) -> (%8765:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=635), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=636))] 
(%8765:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)]) -> (%8766:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=635)]) - graph.CallGraphOp @model.layers.16.mlp (%8766:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=635)]) -> (%8771:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643), )] (%8771:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)], %8765:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)]) -> (%8772:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)]) - cf.ReturnOp (%8772:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)], %8745:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622)], %8747:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=624)]) -> () + (%9100:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8240:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %8241:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47)]) -> (%9152:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9123:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639)], %9125:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=641)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=626, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=627, solved=0))] (%9100:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9101:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=626)]) + graph.CallGraphOp @model.layers.16.self_attn (%9101:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=626)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8240:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %8241:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47)]) -> (%9143:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=651)], %9123:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639)], %9125:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=641)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=651, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9100:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9143:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=651)]) -> (%9144:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=652, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=653, solved=0))] (%9144:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9145:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=652)]) + graph.CallGraphOp @model.layers.16.mlp (%9145:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=652)]) -> (%9151:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=660)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=660, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9144:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9151:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=660)]) -> (%9152:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) 
+ cf.ReturnOp (%9152:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9123:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639)], %9125:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=641)]) -> () } } graph.SubGraphOp @model.layers.16.self_attn [using_qnn:true, symbol:model.layers.16.self_attn] { - (%8732:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=610)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8047:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %8048:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) -> (%8764:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)], %8745:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622)], 
%8747:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=624)]) { - linalg.CPU.LinearOp (%8732:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=610)]) -> (%8733:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=616)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=610), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=613), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=612))] (%8732:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=610)]) -> (%8734:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=613)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=610), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=615), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=614))] (%8732:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=610)]) -> (%8735:tensor<[1, 
32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=615)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=616), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=616), )] (%8733:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=616)]) -> (%8733:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=616)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=616), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=616), )] (%8733:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=616)]) -> (%8736:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=616)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=613), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=613), )] (%8734:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=613)]) -> (%8734:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=613)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=613), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=613), )] (%8734:tensor<[1, 32, 8, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=613)]) -> (%8737:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=613)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=615), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=615), )] (%8735:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=615)]) -> (%8735:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=615)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=615), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=615), )] (%8735:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=615)]) -> (%8738:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=615)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=616), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=617), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=618))] (%8736:tensor<[1, 16, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=616)]) -> (%8739:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=617)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=613), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=620))] (%8737:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=613)]) -> (%8740:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=617), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=617), )] (%8739:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=617)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8741:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=617)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619), )] (%8740:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8742:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619), outputs_0:QuantSpec(Raw(type: Float16), uuid=621), )] (%8742:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=619)]) -> (%8743:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=621)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=621), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622), )] (%8743:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=621)]) -> (%8744:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622), )] (%8744:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622)]) -> (%8745:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=615), outputs_0:QuantSpec(Raw(type: Float16), uuid=623), )] (%8738:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=615)]) -> (%8746:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=623)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=623), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=624), )] (%8746:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=623)]) -> (%8747:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=624)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19), )] (%8047:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %8745:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622)]) -> (%8748:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=624), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47), )] (%8048:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)], %8747:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=624)]) -> 
(%8749:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19), )] (%8748:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)]) -> (%8750:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47), )] (%8749:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) -> (%8751:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=617), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=625), )] (%8741:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=617)], %8750:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)]) -> (%8752:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=625)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=625), inputs_1:QuantSpec(Raw(type: Float32), uuid=626), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=625), )] (%8752:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=625)], %8753:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=626), constant:[0.088388346]]) -> (%8754:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=625)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=625), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627), )] (%8754:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=625)]) -> (%8755:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627)]) - linalg.CPU.AddOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627), inputs_1:QuantSpec(Raw(type: Int16), uuid=628), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627), )] (%8755:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627)], %8756:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=628), constant:[-20]]) -> (%8757:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=629), outputs_0:QuantSpec(Raw(type: UInt8), uuid=630), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8758:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=629), constant:[0.118652344]]) -> (%8759:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=630)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=630), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=625), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627), )] (%8759:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=630)], %8754:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=625)], %8757:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627)]) -> (%8760:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=631), )] (%8760:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627)]) -> (%8761:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=631)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=631), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=632), )] (%8761:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=631)], %8751:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) -> (%8762:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=632)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=632), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=632), )] (%8762:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=632)]) -> (%8763:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=632)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=632), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=632), )] (%8763:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=632)]) -> (%8763:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=632)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=632), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=633))] (%8763:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=632)]) -> (%8764:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)]) - cf.ReturnOp (%8764:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)], %8745:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=622)], %8747:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=624)]) -> () + (%9101:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=626)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8240:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %8241:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47)]) -> (%9143:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=651)], 
%9123:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639)], %9125:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=641)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=626, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=629, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=628, solved=0))] (%9101:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=626)]) -> (%9102:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=629)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=626, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=631, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=630, solved=0))] (%9101:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=626)]) -> (%9103:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=631)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=626, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=633, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=632, solved=0))] (%9101:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=626)]) -> (%9104:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=633)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=629, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=629, solved=0), )] (%9102:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=629)]) -> (%9102:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=629)]) + linalg.CPU.TransposeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=629, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=629, solved=0), )] (%9102:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=629)]) -> (%9105:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=629)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=631, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=631, solved=0), )] (%9103:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=631)]) -> (%9103:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=631)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=631, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=631, solved=0), )] (%9103:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=631)]) -> (%9106:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=631)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=633, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=633, solved=0), )] (%9104:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=633)]) -> (%9104:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=633)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=633, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=633, solved=0), )] (%9104:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=633)]) -> (%9107:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=633)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=629, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=635, solved=0))] (%9105:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=629)]) -> (%9108:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=631, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=637, solved=0))] (%9106:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=631)]) -> (%9109:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), )] (%9108:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)]) -> (%9108:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=634, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), )] (%9108:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)]) -> (%9108:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), )] (%9108:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)]) -> (%9110:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), )] (%9110:tensor<[1, 16, 32, 64], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)], %9108:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)]) -> (%9111:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), )] (%9111:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9112:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), )] (%9108:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9113:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), )] (%9113:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)], %9112:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)]) -> (%9114:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), )] (%9109:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) -> (%9109:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), )] (%9109:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) -> (%9109:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), )] (%9109:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) 
-> (%9115:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), )] (%9115:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)], %9109:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) -> (%9116:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), )] (%9116:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=636)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9117:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), )] (%9109:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9118:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, 
solved=0), )] (%9118:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)], %9117:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) -> (%9119:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=638, solved=0), )] (%9119:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) -> (%9120:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=638)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=638, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639, solved=0), )] (%9120:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=638)]) -> (%9121:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, 
scale_type: Float32), uuid=639, solved=0), )] (%9121:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639)]) -> (%9123:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=633, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=640, solved=0), )] (%9107:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=633)]) -> (%9124:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=640)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=640, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=641, solved=0), )] (%9124:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=640)]) -> (%9125:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=641)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19, solved=0), )] (%8240:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %9123:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639)]) -> (%9127:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=641, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47, solved=0), )] (%8241:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47)], %9125:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=641)]) -> (%9128:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19, solved=0), )] (%9127:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)]) -> (%9129:tensor<[1, 16, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47, solved=0), )] (%9128:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47)]) -> (%9130:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=642, solved=0), )] (%9114:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)], %9129:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)]) -> (%9131:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=642)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=642, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=643, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=642, solved=0), )] (%9131:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=642)], %9132:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=643), constant:[0.088388346]]) -> (%9133:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=642)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=642, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=644, solved=0), )] (%9133:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=642)]) -> (%9134:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=644)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=644, 
solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=645, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=644, solved=0), )] (%9134:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=644)], %9135:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=645), constant:[-20]]) -> (%9136:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=644)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=646, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=647, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9137:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=646), constant:[0]]) -> (%9138:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=647)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=647, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=642, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=644, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=644, solved=0), )] (%9138:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=647)], %9133:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=642)], %9136:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=644)]) -> (%9139:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=644)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=644, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=648, solved=0), )] (%9139:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=644)]) -> (%9140:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=648)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=648, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=649, solved=0), )] (%9140:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=648)], %9130:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47)]) -> (%9141:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=649)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=649, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=649, solved=0), )] (%9141:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=649)]) -> (%9142:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=649)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=649, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=649, solved=0), )] (%9142:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=649)]) -> (%9142:tensor<[1, 32, 2048], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=649)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=649, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=651, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=650, solved=0))] (%9142:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=649)]) -> (%9143:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=651)]) + cf.ReturnOp (%9143:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=651)], %9123:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639)], %9125:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=641)]) -> () } } graph.SubGraphOp @model.layers.16.mlp [using_qnn:true, symbol:model.layers.16.mlp] { - (%8766:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=635)]) -> (%8771:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=635), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=638), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=637))] (%8766:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=635)]) -> (%8767:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=638)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=638), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639), )] (%8767:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=638)]) -> (%8768:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=635), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=641), weight_weight:QuantSpec(LPBQ(quant_min: -8, 
quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=640))] (%8766:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=635)]) -> (%8769:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=641)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=641), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639), )] (%8768:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639)], %8769:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=641)]) -> (%8770:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=642))] (%8770:tensor<[1, 32, 6144], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639)]) -> (%8771:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)]) - cf.ReturnOp (%8771:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)]) -> () + (%9145:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=652)]) -> (%9151:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=660)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=652, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=655, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=654, solved=0))] (%9145:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=652)]) -> (%9146:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=655)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=652, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=656, solved=0))] (%9145:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=652)]) -> (%9147:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=658, solved=0), )] (%9147:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657)]) -> (%9148:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=658)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=658, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657, solved=0), )] (%9147:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657)], %9148:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=658)]) -> (%9149:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=655, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657, solved=0), )] (%9149:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657)], %9146:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=655)]) -> (%9150:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=660, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=659, solved=0))] (%9150:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657)]) -> (%9151:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=660)]) + cf.ReturnOp (%9151:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=660)]) -> () } } graph.SubGraphOp @model.layers.17 [using_qnn:true, symbol:model.layers.17] { - (%8772:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8049:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %8050:tensor<[1, 8, 992, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) -> (%8813:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)], %8786:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656)], %8788:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=658)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=644), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=645))] (%8772:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)]) -> (%8773:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=644)]) - graph.CallGraphOp @model.layers.17.self_attn (%8773:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=644)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8049:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %8050:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) -> (%8805:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668)], %8786:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656)], %8788:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=658)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668), )] (%8805:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668)], %8772:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=643)]) -> (%8806:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668)]) - linalg.CPU.RMSNormOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=670))] (%8806:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668)]) -> (%8807:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669)]) - graph.CallGraphOp @model.layers.17.mlp (%8807:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669)]) -> (%8812:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677), )] (%8812:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)], %8806:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668)]) -> (%8813:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)]) - cf.ReturnOp (%8813:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)], %8786:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656)], %8788:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=658)]) -> () + (%9152:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8242:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %8243:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48)]) -> (%9204:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9175:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, 
quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674)], %9177:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=676)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=661, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=662, solved=0))] (%9152:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9153:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=661)]) + graph.CallGraphOp @model.layers.17.self_attn (%9153:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=661)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8242:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=20)], %8243:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48)]) -> (%9195:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=686)], %9175:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674)], %9177:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=676)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=686, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9152:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9195:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=686)]) -> (%9196:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=687, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=688, solved=0))] (%9196:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9197:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=687)]) + graph.CallGraphOp @model.layers.17.mlp (%9197:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=687)]) -> (%9203:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=695)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=695, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9196:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], 
%9203:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=695)]) -> (%9204:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + cf.ReturnOp (%9204:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9175:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674)], %9177:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=676)]) -> () } } graph.SubGraphOp @model.layers.17.self_attn [using_qnn:true, symbol:model.layers.17.self_attn] { - (%8773:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=644)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8049:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %8050:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=48)]) -> (%8805:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668)], %8786:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656)], %8788:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=658)]) { - linalg.CPU.LinearOp (%8773:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=644)]) -> (%8774:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=650)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=644), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=646))] (%8773:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=644)]) -> (%8775:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=644), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=649), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=648))] (%8773:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=644)]) -> (%8776:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=650), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=650), )] (%8774:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=650)]) -> (%8774:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=650)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=650), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=650), )] (%8774:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=650)]) -> (%8777:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=650)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647), )] (%8775:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647)]) -> (%8775:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=647)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647), )] (%8775:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647)]) -> (%8778:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649), )] (%8776:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649)]) -> (%8776:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649), )] (%8776:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649)]) -> (%8779:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=650), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=651), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=652))] (%8777:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=650)]) -> (%8780:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=651)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=653), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=654))] (%8778:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647)]) -> (%8781:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=653)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=651), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=651), )] (%8780:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=651)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8782:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=651)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=653), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=653), )] (%8781:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=653)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8783:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=653)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=653), outputs_0:QuantSpec(Raw(type: Float16), uuid=655), )] (%8783:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=653)]) -> (%8784:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=655)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=655), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656), )] (%8784:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=655)]) -> (%8785:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656), )] (%8785:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656)]) -> (%8786:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649), outputs_0:QuantSpec(Raw(type: Float16), uuid=657), )] (%8779:tensor<[1, 8, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649)]) -> (%8787:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=657)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=657), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=658), )] (%8787:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=657)]) -> (%8788:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=658)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20), )] (%8049:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %8786:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656)]) -> (%8789:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=658), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48), )] (%8050:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)], %8788:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=658)]) -> (%8790:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20), )] (%8789:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)]) -> (%8791:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48), )] (%8790:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) -> (%8792:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) - linalg.CPU.MatMulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=651), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=659), )] (%8782:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=651)], %8791:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)]) -> (%8793:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=659)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=659), inputs_1:QuantSpec(Raw(type: Float32), uuid=660), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=659), )] (%8793:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=659)], %8794:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=660), constant:[0.088388346]]) -> (%8795:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=659)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=659), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=661), )] (%8795:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=659)]) -> (%8796:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=661)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=661), inputs_1:QuantSpec(Raw(type: Int16), uuid=662), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=661), )] (%8796:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=661)], %8797:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=662), constant:[-20]]) -> (%8798:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=661)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=663), outputs_0:QuantSpec(Raw(type: UInt8), uuid=664), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8799:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=663), constant:[-0.99609375]]) -> (%8800:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=664)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=664), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=659), 
inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=661), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=661), )] (%8800:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=664)], %8795:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=659)], %8798:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=661)]) -> (%8801:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=661)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=661), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665), )] (%8801:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=661)]) -> (%8802:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=666), )] 
(%8802:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665)], %8792:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) -> (%8803:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=666)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=666), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=666), )] (%8803:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=666)]) -> (%8804:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=666)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=666), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=666), )] (%8804:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=666)]) -> (%8804:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=666)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=666), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=667))] (%8804:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=666)]) -> (%8805:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668)]) - cf.ReturnOp (%8805:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=668)], %8786:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=656)], %8788:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=658)]) -> () + (%9153:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=661)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8242:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %8243:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48)]) -> (%9195:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=686)], %9175:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674)], %9177:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=676)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=661, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=664, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=663, solved=0))] (%9153:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=661)]) -> (%9154:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=664)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=661, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=666, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=665, solved=0))] (%9153:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=661)]) -> (%9155:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=666)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=661, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=668, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=667, solved=0))] (%9153:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=661)]) -> (%9156:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=668)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=664, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=664, solved=0), )] (%9154:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=664)]) -> (%9154:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=664)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=664, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=664, solved=0), )] (%9154:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=664)]) -> (%9157:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=664)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=666, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=666, solved=0), )] (%9155:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=666)]) -> (%9155:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=666)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=666, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=666, solved=0), )] (%9155:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=666)]) -> (%9158:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=666)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=668, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=668, solved=0), )] (%9156:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=668)]) -> (%9156:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=668)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=668, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=668, solved=0), )] (%9156:tensor<[1, 32, 8, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=668)]) -> (%9159:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=668)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=664, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=670, solved=0))] (%9157:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=664)]) -> (%9160:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=666, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=672, solved=0))] (%9158:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=666)]) -> (%9161:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), )] (%9160:tensor<[1, 16, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)]) -> (%9160:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), )] (%9160:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)]) -> (%9160:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), )] (%9160:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)]) -> (%9162:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)]) + linalg.CPU.ConcatOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), )] (%9162:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)], %9160:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)]) -> (%9163:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), )] (%9163:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> 
(%9164:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), )] (%9160:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9165:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), )] (%9165:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=669)], %9164:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)]) -> (%9166:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), )] (%9161:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) -> (%9161:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), )] (%9161:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) -> (%9161:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) + linalg.CPU.NegOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), )] (%9161:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) -> (%9167:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), )] (%9167:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)], %9161:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) -> (%9168:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), )] (%9168:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9169:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), )] (%9161:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9170:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=671)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), )] (%9170:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)], %9169:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) -> (%9171:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=673, solved=0), )] (%9171:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) -> (%9172:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=673)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=673, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674, solved=0), )] 
(%9172:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=673)]) -> (%9173:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674, solved=0), )] (%9173:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674)]) -> (%9175:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=668, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=675, solved=0), )] (%9159:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=668)]) -> (%9176:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=675)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=675, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=676, solved=0), )] (%9176:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=675)]) -> (%9177:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, 
quant_to_type: UInt8, scale_type: Float32), uuid=676)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20, solved=0), )] (%8242:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %9175:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674)]) -> (%9179:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=676, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48, solved=0), )] (%8243:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48)], %9177:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=676)]) -> (%9180:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48)]) + 
linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20, solved=0), )] (%9179:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)]) -> (%9181:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48, solved=0), )] (%9180:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48)]) -> (%9182:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=677, solved=0), )] (%9166:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=669)], %9181:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)]) -> (%9183:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=677)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=677, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=678, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=677, solved=0), )] (%9183:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=677)], %9184:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=678), constant:[0.088388346]]) -> (%9185:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=677)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=677, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=679, solved=0), )] (%9185:tensor<[1, 16, 32, 1024], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=677)]) -> (%9186:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=679)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=679, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=680, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=679, solved=0), )] (%9186:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=679)], %9187:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=680), constant:[-20]]) -> (%9188:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=679)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=681, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=682, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9189:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=681), constant:[0]]) -> (%9190:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), 
uuid=682)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=682, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=677, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=679, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=679, solved=0), )] (%9190:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=682)], %9185:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=677)], %9188:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=679)]) -> (%9191:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=679)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=679, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=683, solved=0), )] (%9191:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=679)]) -> (%9192:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=683)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=683, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=684, solved=0), )] (%9192:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=683)], %9182:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48)]) -> (%9193:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=684)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=684, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=684, solved=0), )] (%9193:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=684)]) -> (%9194:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=684)]) + linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=684, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=684, solved=0), )] (%9194:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=684)]) -> (%9194:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=684)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=684, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=686, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=685, solved=0))] (%9194:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=684)]) -> (%9195:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=686)]) + cf.ReturnOp (%9195:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=686)], %9175:tensor<[1, 8, 128, 32], UInt8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674)], %9177:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=676)]) -> () } } graph.SubGraphOp @model.layers.17.mlp [using_qnn:true, symbol:model.layers.17.mlp] { - (%8807:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669)]) -> (%8812:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=672), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=671))] (%8807:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669)]) -> (%8808:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=672)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=672), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=673), )] (%8808:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=672)]) -> (%8809:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=673)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=675), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=674))] (%8807:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669)]) -> (%8810:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=675)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=673), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=675), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=673), )] (%8809:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=673)], %8810:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=675)]) -> (%8811:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=673)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=673), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=676))] (%8811:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=673)]) -> (%8812:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)]) - cf.ReturnOp (%8812:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)]) -> () + (%9197:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=687)]) -> (%9203:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=695)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=687, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=690, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: 
UInt4, scale_1_type: Float32), uuid=689, solved=0))] (%9197:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=687)]) -> (%9198:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=690)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=687, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=691, solved=0))] (%9197:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=687)]) -> (%9199:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=693, solved=0), )] (%9199:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692)]) -> (%9200:tensor<[1, 
32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=693)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=693, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692, solved=0), )] (%9199:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692)], %9200:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=693)]) -> (%9201:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=690, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692, solved=0), )] (%9201:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=692)], %9198:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=690)]) -> (%9202:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=695, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=694, solved=0))] (%9202:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692)]) -> (%9203:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=695)]) + cf.ReturnOp (%9203:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=695)]) -> () } } graph.SubGraphOp @model.layers.18 [using_qnn:true, symbol:model.layers.18] { - (%8813:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8051:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %8052:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) -> (%8854:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)], %8827:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690)], %8829:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=692)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=678), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=679))] (%8813:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)]) -> (%8814:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=678)]) 
- graph.CallGraphOp @model.layers.18.self_attn (%8814:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=678)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8051:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %8052:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) -> (%8846:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702)], %8827:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690)], %8829:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=692)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702), )] (%8846:tensor<[1, 32, 
2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702)], %8813:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)]) -> (%8847:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=703), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=704))] (%8847:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702)]) -> (%8848:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=703)]) - graph.CallGraphOp @model.layers.18.mlp (%8848:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=703)]) -> (%8853:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=702), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711), )] (%8853:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)], %8847:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702)]) -> (%8854:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)]) - cf.ReturnOp (%8854:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)], %8827:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690)], %8829:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=692)]) -> () + (%9204:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8244:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=21)], %8245:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49)]) -> (%9256:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9227:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709)], %9229:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=711)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=696, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=697, solved=0))] (%9204:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9205:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=696)]) + graph.CallGraphOp @model.layers.18.self_attn (%9205:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=696)], %8267:tensor<[1, 1, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8244:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %8245:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49)]) -> (%9247:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=721)], %9227:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709)], %9229:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=711)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=721, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9204:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=60)], %9247:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=721)]) -> (%9248:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=722, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=723, solved=0))] (%9248:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9249:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=722)]) + graph.CallGraphOp @model.layers.18.mlp (%9249:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=722)]) -> (%9255:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=730)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=730, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9248:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9255:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=730)]) -> (%9256:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + cf.ReturnOp (%9256:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9227:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709)], %9229:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=711)]) -> () } } graph.SubGraphOp @model.layers.18.self_attn [using_qnn:true, symbol:model.layers.18.self_attn] { - (%8814:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=678)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8051:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %8052:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) -> (%8846:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702)], %8827:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690)], %8829:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=692)]) { - linalg.CPU.LinearOp (%8814:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=678)]) -> (%8815:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=684)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=678), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=681), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=680))] (%8814:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=678)]) -> (%8816:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=681)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=678), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=682))] (%8814:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=678)]) -> (%8817:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=684), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=684), )] (%8815:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=684)]) -> (%8815:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=684)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=684), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=684), )] (%8815:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=684)]) -> (%8818:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=684)]) - linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=681), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=681), )] (%8816:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=681)]) -> (%8816:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=681)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=681), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=681), )] (%8816:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=681)]) -> (%8819:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=681)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683), )] (%8817:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683)]) -> (%8817:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683)]) - 
linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683), )] (%8817:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683)]) -> (%8820:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=684), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=685), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=686))] (%8818:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=684)]) -> (%8821:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=685)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=681), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=687), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=688))] (%8819:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=681)]) -> (%8822:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=687)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=685), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=685), )] (%8821:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=685)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8823:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=685)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=687), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=687), )] (%8822:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=687)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8824:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=687)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=687), outputs_0:QuantSpec(Raw(type: Float16), uuid=689), )] (%8824:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=687)]) -> (%8825:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=689)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=689), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690), )] (%8825:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=689)]) -> (%8826:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690), )] (%8826:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690)]) -> (%8827:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683), outputs_0:QuantSpec(Raw(type: Float16), uuid=691), )] (%8820:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683)]) -> (%8828:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=691)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=691), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=692), )] (%8828:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=691)]) -> (%8829:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=692)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21), )] (%8051:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %8827:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690)]) -> (%8830:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=692), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49), )] (%8052:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)], %8829:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=692)]) -> (%8831:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21), )] (%8830:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)]) -> (%8832:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49), )] (%8831:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) -> (%8833:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=685), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=693), )] (%8823:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=685)], %8832:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)]) -> (%8834:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=693)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=693), inputs_1:QuantSpec(Raw(type: Float32), uuid=694), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=693), )] (%8834:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=693)], %8835:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=694), constant:[0.088388346]]) -> (%8836:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=693)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=693), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695), )] (%8836:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=693)]) -> (%8837:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695), inputs_1:QuantSpec(Raw(type: Int16), uuid=696), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695), )] (%8837:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695)], %8838:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=696), constant:[-20]]) -> (%8839:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=697), 
outputs_0:QuantSpec(Raw(type: UInt8), uuid=698), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8840:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=697), constant:[0.24023438]]) -> (%8841:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=698)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=698), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=693), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695), )] (%8841:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=698)], %8836:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=693)], %8839:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695)]) -> (%8842:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699), )] (%8842:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695)]) -> (%8843:tensor<[1, 16, 32, 1024], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=700), )] (%8843:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699)], %8833:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) -> (%8844:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=700)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=700), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=700), )] (%8844:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=700)]) -> (%8845:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=700)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=700), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=700), )] (%8845:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=700)]) -> (%8845:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=700)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=700), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=701))] (%8845:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=700)]) -> (%8846:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702)]) - cf.ReturnOp (%8846:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702)], %8827:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=690)], %8829:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=692)]) -> () + (%9205:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=696)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8244:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %8245:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49)]) -> (%9247:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=721)], %9227:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709)], %9229:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=711)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=696, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=699, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=698, solved=0))] (%9205:tensor<[1, 32, 
2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=696)]) -> (%9206:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=699)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=696, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=701, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=700, solved=0))] (%9205:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=696)]) -> (%9207:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=701)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=696, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=703, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=702, solved=0))] (%9205:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=696)]) -> (%9208:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=703)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=699, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=699, solved=0), )] (%9206:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=699)]) -> (%9206:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=699)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=699, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=699, solved=0), )] (%9206:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=699)]) -> (%9209:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=699)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=701, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=701, solved=0), )] (%9207:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=701)]) -> (%9207:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=701)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=701, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=701, solved=0), )] (%9207:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=701)]) -> (%9210:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=701)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=703, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=703, solved=0), )] (%9208:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=703)]) -> (%9208:tensor<[1, 32, 8, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=703)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=703, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=703, solved=0), )] (%9208:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=703)]) -> (%9211:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=703)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=699, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=705, solved=0))] (%9209:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=699)]) -> (%9212:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=701, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=707, solved=0))] (%9210:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=701)]) -> (%9213:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> 
(%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), )] (%9212:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)]) -> (%9212:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), )] (%9212:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)]) -> (%9212:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), )] (%9212:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)]) -> (%9214:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), )] (%9214:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)], %9212:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)]) -> (%9215:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), )] (%9215:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9216:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), )] (%9212:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9217:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=704, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), )] (%9217:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)], %9216:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)]) -> (%9218:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), )] (%9213:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)]) -> (%9213:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), )] (%9213:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)]) -> (%9213:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), )] (%9213:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)]) -> (%9219:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), )] (%9219:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=706)], %9213:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)]) -> (%9220:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), )] (%9220:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9221:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), )] 
(%9213:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9222:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), )] (%9222:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)], %9221:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)]) -> (%9223:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=708, solved=0), )] 
(%9223:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)]) -> (%9224:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=708)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=708, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709, solved=0), )] (%9224:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=708)]) -> (%9225:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709, solved=0), )] (%9225:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709)]) -> (%9227:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=703, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=710, solved=0), )] (%9211:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=703)]) -> (%9228:tensor<[1, 8, 32, 
128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=710)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=710, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=711, solved=0), )] (%9228:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=710)]) -> (%9229:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=711)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21, solved=0), )] (%8244:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %9227:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709)]) -> (%9231:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=711, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49, solved=0), )] 
(%8245:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49)], %9229:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=711)]) -> (%9232:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21, solved=0), )] (%9231:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)]) -> (%9233:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49, solved=0), )] (%9232:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49)]) -> (%9234:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=712, solved=0), )] (%9218:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)], %9233:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)]) -> (%9235:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=712)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=712, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=713, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=712, solved=0), )] (%9235:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=712)], %9236:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=713), constant:[0.088388346]]) -> (%9237:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=712)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=712, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=714, solved=0), )] (%9237:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=712)]) -> (%9238:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=714)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=714, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=715, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=714, solved=0), )] (%9238:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=714)], %9239:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=715), constant:[-20]]) -> (%9240:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=714)]) + 
linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=716, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=717, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9241:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=716), constant:[0]]) -> (%9242:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=717)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=717, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=712, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=714, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=714, solved=0), )] (%9242:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=717)], %9237:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=712)], %9240:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=714)]) -> (%9243:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=714)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=714, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=718, solved=0), )] (%9243:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=714)]) -> (%9244:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=718)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=718, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=719, solved=0), )] (%9244:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=718)], %9234:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49)]) -> (%9245:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=719)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=719, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=719, solved=0), )] (%9245:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=719)]) -> (%9246:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=719)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=719, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=719, solved=0), )] (%9246:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=719)]) -> (%9246:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=719)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=719, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=721, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=720, solved=0))] (%9246:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=719)]) -> (%9247:tensor<[1, 
32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=721)]) + cf.ReturnOp (%9247:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=721)], %9227:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709)], %9229:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=711)]) -> () } } graph.SubGraphOp @model.layers.18.mlp [using_qnn:true, symbol:model.layers.18.mlp] { - (%8848:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=703)]) -> (%8853:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=703), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=706), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=705))] (%8848:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=703)]) -> (%8849:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=706)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=706), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=707), )] (%8849:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=706)]) -> (%8850:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=707)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=703), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=709), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=708))] (%8848:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=703)]) -> (%8851:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=709)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=707), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=709), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=707), )] (%8850:tensor<[1, 32, 6144], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=707)], %8851:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=709)]) -> (%8852:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=707)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=707), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=710))] (%8852:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=707)]) -> (%8853:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)]) - cf.ReturnOp (%8853:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)]) -> () + (%9249:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=722)]) -> (%9255:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=730)]) { + linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=722, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=725, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=724, solved=0))] (%9249:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=722)]) -> (%9250:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=725)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=722, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=726, solved=0))] (%9249:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=722)]) -> (%9251:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=728, solved=0), )] (%9251:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727)]) -> (%9252:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=728)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=728, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727, solved=0), )] (%9251:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727)], %9252:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=728)]) -> (%9253:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727, solved=0), 
inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=725, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727, solved=0), )] (%9253:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727)], %9250:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=725)]) -> (%9254:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=730, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=729, solved=0))] (%9254:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727)]) -> (%9255:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=730)]) + cf.ReturnOp (%9255:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=730)]) -> () } } graph.SubGraphOp @model.layers.19 [using_qnn:true, symbol:model.layers.19] { - (%8854:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8053:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %8054:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) -> (%8895:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)], %8868:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724)], %8870:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=726)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=712), 
weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=713))] (%8854:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)]) -> (%8855:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=712)]) - graph.CallGraphOp @model.layers.19.self_attn (%8855:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=712)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8053:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %8054:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) -> (%8887:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736)], %8868:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724)], %8870:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=726)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736), )] (%8887:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736)], %8854:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=711)]) -> (%8888:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=737), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=738))] (%8888:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736)]) -> (%8889:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=737)]) - graph.CallGraphOp @model.layers.19.mlp (%8889:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=737)]) -> (%8894:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745), )] (%8894:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)], %8888:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736)]) -> (%8895:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)]) - cf.ReturnOp (%8895:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)], %8868:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724)], %8870:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=726)]) -> () + (%9256:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8246:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %8247:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50)]) -> (%9308:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9279:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744)], %9281:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=746)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=731, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=732, solved=0))] (%9256:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=60)]) -> (%9257:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=731)]) + graph.CallGraphOp @model.layers.19.self_attn (%9257:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=731)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8246:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %8247:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50)]) -> (%9299:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=756)], %9279:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744)], %9281:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=746)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=756, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9256:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9299:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=756)]) -> (%9300:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=757, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=758, solved=0))] (%9300:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9301:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=757)]) + graph.CallGraphOp @model.layers.19.mlp (%9301:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=757)]) -> (%9307:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=765)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=765, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9300:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9307:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=765)]) -> (%9308:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + cf.ReturnOp (%9308:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9279:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744)], %9281:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: 
UInt8, scale_type: Float32), uuid=746)]) -> () } } graph.SubGraphOp @model.layers.19.self_attn [using_qnn:true, symbol:model.layers.19.self_attn] { - (%8855:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=712)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8053:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %8054:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) -> (%8887:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736)], %8868:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724)], %8870:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=726)]) { - linalg.CPU.LinearOp (%8855:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=712)]) -> (%8856:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=718)]) - linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=712), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=715), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=714))] (%8855:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=712)]) -> (%8857:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=715)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=712), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=716))] (%8855:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=712)]) -> (%8858:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=718), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=718), )] (%8856:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=718)]) -> (%8856:tensor<[1, 32, 16, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=718)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=718), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=718), )] (%8856:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=718)]) -> (%8859:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=718)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=715), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=715), )] (%8857:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=715)]) -> (%8857:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=715)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=715), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=715), )] (%8857:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=715)]) -> (%8860:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=715)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717), )] (%8858:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717)]) -> (%8858:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717), )] (%8858:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717)]) -> (%8861:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=718), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=719), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=720))] (%8859:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=718)]) -> (%8862:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=719)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=715), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=721), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=722))] (%8860:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=715)]) -> (%8863:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=721)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=719), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=719), )] (%8862:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=719)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8864:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=719)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=721), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=721), )] (%8863:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=721)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8865:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=721)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=721), outputs_0:QuantSpec(Raw(type: Float16), uuid=723), )] (%8865:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=721)]) -> (%8866:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=723)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=723), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724), )] (%8866:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=723)]) -> (%8867:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724), )] (%8867:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724)]) -> (%8868:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717), outputs_0:QuantSpec(Raw(type: Float16), uuid=725), )] (%8861:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717)]) -> (%8869:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=725)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=725), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=726), )] (%8869:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=725)]) -> (%8870:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=726)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22), )] (%8053:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %8868:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724)]) -> (%8871:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=726), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50), )] (%8054:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)], %8870:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=726)]) -> (%8872:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22), )] 
(%8871:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)]) -> (%8873:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50), )] (%8872:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) -> (%8874:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=719), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727), )] (%8864:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=719)], %8873:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)]) -> (%8875:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727)]) - linalg.CPU.MulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727), inputs_1:QuantSpec(Raw(type: Float32), uuid=728), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727), )] (%8875:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727)], %8876:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=728), constant:[0.088388346]]) -> (%8877:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729), )] (%8877:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727)]) -> (%8878:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729), inputs_1:QuantSpec(Raw(type: Int16), uuid=730), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729), )] (%8878:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=729)], %8879:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=730), constant:[-20]]) -> (%8880:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=731), outputs_0:QuantSpec(Raw(type: UInt8), uuid=732), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8881:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=731), constant:[0.55078125]]) -> (%8882:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=732)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=732), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729), )] (%8882:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=732)], %8877:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727)], %8880:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729)]) -> (%8883:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729)]) - linalg.CPU.SoftmaxOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=733), )] (%8883:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729)]) -> (%8884:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=733)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=733), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=734), )] (%8884:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=733)], %8874:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) -> (%8885:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=734)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=734), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=734), )] (%8885:tensor<[1, 16, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=734)]) -> (%8886:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=734)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=734), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=734), )] (%8886:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=734)]) -> (%8886:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=734)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=734), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=735))] (%8886:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=734)]) -> (%8887:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=736)]) - cf.ReturnOp (%8887:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=736)], %8868:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=724)], %8870:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=726)]) -> () + (%9257:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=731)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8246:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %8247:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50)]) -> (%9299:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=756)], %9279:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744)], %9281:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=746)]) { + linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=731, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=734, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=733, solved=0))] (%9257:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=731)]) -> (%9258:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=734)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=731, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=736, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=735, solved=0))] (%9257:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=731)]) -> (%9259:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=736)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=731, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=738, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=737, solved=0))] (%9257:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=731)]) -> (%9260:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=738)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=734, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=734, solved=0), )] (%9258:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=734)]) -> (%9258:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=734)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=734, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=734, solved=0), )] (%9258:tensor<[1, 32, 16, 
128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=734)]) -> (%9261:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=734)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=736, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=736, solved=0), )] (%9259:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=736)]) -> (%9259:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=736)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=736, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=736, solved=0), )] (%9259:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=736)]) -> (%9262:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=736)]) + linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=738, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=738, solved=0), )] (%9260:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=738)]) -> (%9260:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=738)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=738, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=738, solved=0), )] (%9260:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=738)]) -> (%9263:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=738)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=734, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=740, solved=0))] (%9261:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=734)]) -> (%9264:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=736, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=742, solved=0))] (%9262:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=736)]) -> (%9265:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), )] (%9264:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)]) -> (%9264:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), )] (%9264:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)]) -> (%9264:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), )] (%9264:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)]) -> (%9266:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), )] (%9266:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)], %9264:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)]) -> 
(%9267:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), )] (%9267:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9268:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), )] (%9264:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)], %8268:tensor<[1, 1, 
32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9269:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), )] (%9269:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)], %9268:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)]) -> (%9270:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), )] (%9265:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)]) -> (%9265:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), )] (%9265:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)]) -> (%9265:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), )] (%9265:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)]) -> (%9271:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=741, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), )] (%9271:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)], %9265:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)]) -> (%9272:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), )] (%9272:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9273:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=741)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), )] (%9265:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9274:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), )] (%9274:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)], %9273:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=741)]) -> (%9275:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=743, solved=0), )] (%9275:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)]) -> (%9276:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=743)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=743, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744, solved=0), )] (%9276:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=743)]) -> (%9277:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744, solved=0), )] (%9277:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744)]) -> (%9279:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), 
uuid=744)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=738, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=745, solved=0), )] (%9263:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=738)]) -> (%9280:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=745)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=745, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=746, solved=0), )] (%9280:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=745)]) -> (%9281:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=746)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22, solved=0), )] (%8246:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %9279:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744)]) -> (%9283:tensor<[1, 8, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=746, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50, solved=0), )] (%8247:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50)], %9281:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=746)]) -> (%9284:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22, solved=0), )] (%9283:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)]) -> (%9285:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, 
quant_to_type: UInt8, scale_type: Float32), uuid=50, solved=0), )] (%9284:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50)]) -> (%9286:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=747, solved=0), )] (%9270:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)], %9285:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)]) -> (%9287:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=747)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=747, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=748, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=747, solved=0), )] (%9287:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=747)], %9288:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=748), constant:[0.088388346]]) -> (%9289:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=747)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=747, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=749, solved=0), )] (%9289:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=747)]) -> (%9290:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=749)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=749, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=750, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=749, solved=0), )] (%9290:tensor<[1, 16, 32, 1], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=749)], %9291:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=750), constant:[-20]]) -> (%9292:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=749)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=751, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=752, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9293:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=751), constant:[0]]) -> (%9294:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=752)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=752, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=747, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=749, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=749, solved=0), )] (%9294:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=752)], %9289:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=747)], %9292:tensor<[1, 16, 32, 1], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=749)]) -> (%9295:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=749)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=749, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=753, solved=0), )] (%9295:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=749)]) -> (%9296:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=753)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=753, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=754, solved=0), )] (%9296:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=753)], %9286:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, 
scale_type: Float32), uuid=50)]) -> (%9297:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=754)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=754, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=754, solved=0), )] (%9297:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=754)]) -> (%9298:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=754)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=754, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=754, solved=0), )] (%9298:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=754)]) -> (%9298:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=754)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=754, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=756, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=755, solved=0))] (%9298:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=754)]) -> (%9299:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=756)]) + cf.ReturnOp (%9299:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=756)], %9279:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744)], %9281:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=746)]) -> () } } graph.SubGraphOp @model.layers.19.mlp [using_qnn:true, symbol:model.layers.19.mlp] { - (%8889:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=737)]) -> (%8894:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=737), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=740), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=739))] (%8889:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=737)]) -> (%8890:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=740)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=740), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=741), )] (%8890:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=740)]) -> (%8891:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=741)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=737), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=743), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=742))] (%8889:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=737)]) -> (%8892:tensor<[1, 32, 6144], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=743)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=741), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=743), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=741), )] (%8891:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=741)], %8892:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=743)]) -> (%8893:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=741)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=741), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=744))] (%8893:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=741)]) -> (%8894:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)]) - cf.ReturnOp (%8894:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)]) -> () + (%9301:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=757)]) -> (%9307:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=765)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=757, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=760, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=759, solved=0))] (%9301:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=757)]) -> (%9302:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=760)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=757, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, 
scale_1_type: Float32), uuid=761, solved=0))] (%9301:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=757)]) -> (%9303:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=763, solved=0), )] (%9303:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762)]) -> (%9304:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=763)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=763, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762, solved=0), )] (%9303:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762)], %9304:tensor<[1, 32, 6144], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=763)]) -> (%9305:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=760, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762, solved=0), )] (%9305:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762)], %9302:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=760)]) -> (%9306:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=765, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: 
Float32), uuid=764, solved=0))] (%9306:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762)]) -> (%9307:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=765)]) + cf.ReturnOp (%9307:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=765)]) -> () } } graph.SubGraphOp @model.layers.20 [using_qnn:true, symbol:model.layers.20] { - (%8895:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8055:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %8056:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) -> (%8936:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)], %8909:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758)], %8911:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=760)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=747))] (%8895:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)]) -> (%8896:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746)]) - graph.CallGraphOp @model.layers.20.self_attn (%8896:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8055:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %8056:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) -> (%8928:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770)], %8909:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758)], %8911:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=760)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770), )] (%8928:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770)], %8895:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=745)]) -> (%8929:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=771), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=772))] 
(%8929:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770)]) -> (%8930:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=771)]) - graph.CallGraphOp @model.layers.20.mlp (%8930:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=771)]) -> (%8935:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779), )] (%8935:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)], %8929:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770)]) -> (%8936:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)]) - cf.ReturnOp (%8936:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)], %8909:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758)], %8911:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=760)]) -> () + (%9308:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8248:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %8249:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51)]) -> (%9360:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9331:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779)], %9333:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=781)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=766, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=767, solved=0))] (%9308:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9309:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=766)]) + graph.CallGraphOp @model.layers.20.self_attn (%9309:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=766)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8248:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %8249:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51)]) -> (%9351:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=791)], %9331:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779)], %9333:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=781)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=791, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9308:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9351:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=791)]) -> (%9352:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=792, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=793, solved=0))] (%9352:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9353:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=792)]) + graph.CallGraphOp @model.layers.20.mlp (%9353:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=792)]) -> (%9359:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=800)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=800, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9352:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9359:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=800)]) -> (%9360:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) 
+ cf.ReturnOp (%9360:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9331:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779)], %9333:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=781)]) -> () } } graph.SubGraphOp @model.layers.20.self_attn [using_qnn:true, symbol:model.layers.20.self_attn] { - (%8896:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8055:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %8056:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) -> (%8928:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770)], %8909:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758)], 
%8911:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=760)]) { - linalg.CPU.LinearOp (%8896:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746)]) -> (%8897:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=752)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=748))] (%8896:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746)]) -> (%8898:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=751), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=750))] (%8896:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746)]) -> (%8899:tensor<[1, 
32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=751)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=752), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=752), )] (%8897:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=752)]) -> (%8897:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=752)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=752), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=752), )] (%8897:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=752)]) -> (%8900:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=752)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749), )] (%8898:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749)]) -> (%8898:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749), )] (%8898:tensor<[1, 32, 8, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749)]) -> (%8901:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=751), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=751), )] (%8899:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=751)]) -> (%8899:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=751)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=751), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=751), )] (%8899:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=751)]) -> (%8902:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=751)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=752), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=753), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=754))] (%8900:tensor<[1, 16, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=752)]) -> (%8903:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=753)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=755), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=756))] (%8901:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749)]) -> (%8904:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=755)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=753), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=753), )] (%8903:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=753)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8905:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=753)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=755), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=755), )] (%8904:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=755)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8906:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=755)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=755), outputs_0:QuantSpec(Raw(type: Float16), uuid=757), )] (%8906:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=755)]) -> (%8907:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=757)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=757), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758), )] (%8907:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=757)]) -> (%8908:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758), )] (%8908:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758)]) -> (%8909:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=751), outputs_0:QuantSpec(Raw(type: Float16), uuid=759), )] (%8902:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=751)]) -> (%8910:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=759)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=759), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=760), )] (%8910:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=759)]) -> (%8911:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=760)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23), )] (%8055:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %8909:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758)]) -> (%8912:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=760), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51), )] (%8056:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)], %8911:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=760)]) -> 
(%8913:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23), )] (%8912:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)]) -> (%8914:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51), )] (%8913:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) -> (%8915:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=753), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761), )] (%8905:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=753)], %8914:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)]) -> (%8916:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761), inputs_1:QuantSpec(Raw(type: Float32), uuid=762), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761), )] (%8916:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761)], %8917:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=762), constant:[0.088388346]]) -> (%8918:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=763), )] (%8918:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761)]) -> (%8919:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=763)]) - linalg.CPU.AddOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=763), inputs_1:QuantSpec(Raw(type: Int16), uuid=764), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=763), )] (%8919:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=763)], %8920:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=764), constant:[-20]]) -> (%8921:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=763)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=765), outputs_0:QuantSpec(Raw(type: UInt8), uuid=766), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8922:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=765), constant:[0.71875]]) -> (%8923:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=766)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=766), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=763), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=763), )] (%8923:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=766)], %8918:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=761)], %8921:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=763)]) -> (%8924:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=763)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=763), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767), )] (%8924:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=763)]) -> (%8925:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=768), )] (%8925:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767)], %8915:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) -> (%8926:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=768)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=768), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=768), )] (%8926:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=768)]) -> (%8927:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=768)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=768), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=768), )] (%8927:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=768)]) -> (%8927:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=768)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=768), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=769))] (%8927:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=768)]) -> (%8928:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770)]) - cf.ReturnOp (%8928:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=770)], %8909:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=758)], %8911:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=760)]) -> () + (%9309:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=766)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8248:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %8249:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51)]) -> (%9351:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=791)], 
%9331:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779)], %9333:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=781)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=766, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=769, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=768, solved=0))] (%9309:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=766)]) -> (%9310:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=769)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=766, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=771, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=770, solved=0))] (%9309:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=766)]) -> (%9311:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=771)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=766, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=773, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=772, solved=0))] (%9309:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=766)]) -> (%9312:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=773)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=769, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=769, solved=0), )] (%9310:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=769)]) -> (%9310:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=769)]) + linalg.CPU.TransposeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=769, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=769, solved=0), )] (%9310:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=769)]) -> (%9313:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=769)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=771, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=771, solved=0), )] (%9311:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=771)]) -> (%9311:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=771)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=771, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=771, solved=0), )] (%9311:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=771)]) -> (%9314:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=771)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=773, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=773, solved=0), )] (%9312:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=773)]) -> (%9312:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=773)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=773, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=773, solved=0), )] (%9312:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=773)]) -> (%9315:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=773)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=769, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=775, solved=0))] (%9313:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=769)]) -> (%9316:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=771, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=777, solved=0))] (%9314:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=771)]) -> (%9317:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), )] (%9316:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)]) -> (%9316:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=774, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), )] (%9316:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)]) -> (%9316:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), )] (%9316:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)]) -> (%9318:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), )] (%9318:tensor<[1, 16, 32, 64], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)], %9316:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)]) -> (%9319:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), )] (%9319:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9320:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), )] (%9316:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9321:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), )] (%9321:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)], %9320:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)]) -> (%9322:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), )] (%9317:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) -> (%9317:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), )] (%9317:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) -> (%9317:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), )] (%9317:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) 
-> (%9323:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), )] (%9323:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)], %9317:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) -> (%9324:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), )] (%9324:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=776)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9325:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), )] (%9317:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9326:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, 
solved=0), )] (%9326:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)], %9325:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) -> (%9327:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=778, solved=0), )] (%9327:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) -> (%9328:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=778)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=778, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779, solved=0), )] (%9328:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=778)]) -> (%9329:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, 
scale_type: Float32), uuid=779, solved=0), )] (%9329:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779)]) -> (%9331:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=773, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=780, solved=0), )] (%9315:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=773)]) -> (%9332:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=780)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=780, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=781, solved=0), )] (%9332:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=780)]) -> (%9333:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=781)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23, solved=0), )] (%8248:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %9331:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779)]) -> (%9335:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=781, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51, solved=0), )] (%8249:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51)], %9333:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=781)]) -> (%9336:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23, solved=0), )] (%9335:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)]) -> (%9337:tensor<[1, 16, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51, solved=0), )] (%9336:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51)]) -> (%9338:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=782, solved=0), )] (%9322:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)], %9337:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)]) -> (%9339:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=782)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=782, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=783, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=782, solved=0), )] (%9339:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=782)], %9340:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=783), constant:[0.088388346]]) -> (%9341:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=782)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=782, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=784, solved=0), )] (%9341:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=782)]) -> (%9342:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=784)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=784, 
solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=785, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=784, solved=0), )] (%9342:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=784)], %9343:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=785), constant:[-20]]) -> (%9344:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=784)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=786, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=787, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9345:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=786), constant:[0]]) -> (%9346:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=787)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=787, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=782, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=784, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=784, solved=0), )] (%9346:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=787)], %9341:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=782)], %9344:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=784)]) -> (%9347:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=784)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=784, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=788, solved=0), )] (%9347:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=784)]) -> (%9348:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=788)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=788, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=789, solved=0), )] (%9348:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=788)], %9338:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51)]) -> (%9349:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=789)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=789, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=789, solved=0), )] (%9349:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=789)]) -> (%9350:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=789)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=789, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=789, solved=0), )] (%9350:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=789)]) -> (%9350:tensor<[1, 32, 2048], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=789)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=789, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=791, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=790, solved=0))] (%9350:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=789)]) -> (%9351:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=791)]) + cf.ReturnOp (%9351:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=791)], %9331:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779)], %9333:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=781)]) -> () } } graph.SubGraphOp @model.layers.20.mlp [using_qnn:true, symbol:model.layers.20.mlp] { - (%8930:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=771)]) -> (%8935:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=771), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=773))] (%8930:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=771)]) -> (%8931:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=775), )] (%8931:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774)]) -> (%8932:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=775)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=771), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=777), weight_weight:QuantSpec(LPBQ(quant_min: -8, 
quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=776))] (%8930:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=771)]) -> (%8933:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=777)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=775), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=777), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=775), )] (%8932:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=775)], %8933:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=777)]) -> (%8934:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=775)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=775), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=778))] (%8934:tensor<[1, 32, 6144], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=775)]) -> (%8935:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)]) - cf.ReturnOp (%8935:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)]) -> () + (%9353:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=792)]) -> (%9359:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=800)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=792, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=795, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=794, solved=0))] (%9353:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=792)]) -> (%9354:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=795)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=792, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=796, solved=0))] (%9353:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=792)]) -> (%9355:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=798, solved=0), )] (%9355:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797)]) -> (%9356:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=798)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=798, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797, solved=0), )] (%9355:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797)], %9356:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=798)]) -> (%9357:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=795, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797, solved=0), )] (%9357:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797)], %9354:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=795)]) -> (%9358:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=800, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=799, solved=0))] (%9358:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797)]) -> (%9359:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=800)]) + cf.ReturnOp (%9359:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=800)]) -> () } } graph.SubGraphOp @model.layers.21 [using_qnn:true, symbol:model.layers.21] { - (%8936:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8057:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %8058:tensor<[1, 8, 992, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) -> (%8977:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)], %8950:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792)], %8952:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=781))] (%8936:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)]) -> (%8937:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780)]) - graph.CallGraphOp @model.layers.21.self_attn (%8937:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8057:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %8058:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) -> (%8969:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804)], %8950:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792)], %8952:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804), )] (%8969:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804)], %8936:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)]) -> (%8970:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804)]) - linalg.CPU.RMSNormOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=805), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=806))] (%8970:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804)]) -> (%8971:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=805)]) - graph.CallGraphOp @model.layers.21.mlp (%8971:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=805)]) -> (%8976:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813), )] (%8976:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)], %8970:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804)]) -> (%8977:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)]) - cf.ReturnOp (%8977:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)], %8950:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792)], %8952:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)]) -> () + (%9360:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8250:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %8251:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52)]) -> (%9412:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9383:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, 
quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814)], %9385:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=816)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=801, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=802, solved=0))] (%9360:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9361:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=801)]) + graph.CallGraphOp @model.layers.21.self_attn (%9361:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=801)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8250:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=24)], %8251:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52)]) -> (%9403:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=826)], %9383:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814)], %9385:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=816)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=826, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9360:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9403:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=826)]) -> (%9404:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=827, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=828, solved=0))] (%9404:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9405:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=827)]) + graph.CallGraphOp @model.layers.21.mlp (%9405:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=827)]) -> (%9411:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=835)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=835, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9404:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], 
%9411:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=835)]) -> (%9412:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + cf.ReturnOp (%9412:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9383:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814)], %9385:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=816)]) -> () } } graph.SubGraphOp @model.layers.21.self_attn [using_qnn:true, symbol:model.layers.21.self_attn] { - (%8937:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8057:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %8058:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=52)]) -> (%8969:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804)], %8950:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792)], %8952:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)]) { - linalg.CPU.LinearOp (%8937:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780)]) -> (%8938:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=786)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=783), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=782))] (%8937:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780)]) -> (%8939:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=783)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=785), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=784))] (%8937:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780)]) -> (%8940:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=786), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=786), )] (%8938:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=786)]) -> (%8938:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=786)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=786), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=786), )] (%8938:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=786)]) -> (%8941:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=786)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=783), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=783), )] (%8939:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=783)]) -> (%8939:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=783)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=783), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=783), )] (%8939:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=783)]) -> (%8942:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=783)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785), )] (%8940:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785)]) -> (%8940:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785), )] (%8940:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785)]) -> (%8943:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=786), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=787), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=788))] (%8941:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=786)]) -> (%8944:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=787)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=783), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=790))] (%8942:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=783)]) -> (%8945:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=787), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=787), )] (%8944:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=787)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8946:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=787)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789), )] (%8945:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8947:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=789)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789), outputs_0:QuantSpec(Raw(type: Float16), uuid=791), )] (%8947:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789)]) -> (%8948:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=791)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=791), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792), )] (%8948:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=791)]) -> (%8949:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792), )] (%8949:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792)]) -> (%8950:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785), outputs_0:QuantSpec(Raw(type: Float16), uuid=793), )] (%8943:tensor<[1, 8, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785)]) -> (%8951:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=793)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=793), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794), )] (%8951:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=793)]) -> (%8952:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24), )] (%8057:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %8950:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792)]) -> (%8953:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52), )] (%8058:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)], %8952:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)]) -> (%8954:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24), )] (%8953:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)]) -> (%8955:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52), )] (%8954:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) -> (%8956:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) - linalg.CPU.MatMulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=787), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=795), )] (%8946:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=787)], %8955:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)]) -> (%8957:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=795)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=795), inputs_1:QuantSpec(Raw(type: Float32), uuid=796), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=795), )] (%8957:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=795)], %8958:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=796), constant:[0.088388346]]) -> (%8959:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=795)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=795), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797), )] (%8959:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=795)]) -> (%8960:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797), inputs_1:QuantSpec(Raw(type: Int16), uuid=798), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797), )] (%8960:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797)], %8961:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=798), constant:[-20]]) -> (%8962:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=799), outputs_0:QuantSpec(Raw(type: UInt8), uuid=800), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8963:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=799), constant:[-0.80859375]]) -> (%8964:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=800)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=800), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=795), 
inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797), )] (%8964:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=800)], %8959:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=795)], %8962:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797)]) -> (%8965:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=801), )] (%8965:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797)]) -> (%8966:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=801)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=801), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=802), )] 
(%8966:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=801)], %8956:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) -> (%8967:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=802)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=802), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=802), )] (%8967:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=802)]) -> (%8968:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=802)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=802), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=802), )] (%8968:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=802)]) -> (%8968:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=802)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=802), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=803))] (%8968:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=802)]) -> (%8969:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804)]) - cf.ReturnOp (%8969:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804)], %8950:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=792)], %8952:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)]) -> () + (%9361:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=801)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8250:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %8251:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52)]) -> (%9403:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=826)], %9383:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814)], %9385:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=816)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=801, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=804, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=803, solved=0))] (%9361:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=801)]) -> (%9362:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=804)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=801, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=806, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=805, solved=0))] (%9361:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=801)]) -> (%9363:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=806)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=801, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=808, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=807, solved=0))] (%9361:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=801)]) -> (%9364:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=808)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=804, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=804, solved=0), )] (%9362:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=804)]) -> (%9362:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=804)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=804, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=804, solved=0), )] (%9362:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=804)]) -> (%9365:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=804)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=806, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=806, solved=0), )] (%9363:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=806)]) -> (%9363:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=806)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=806, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=806, solved=0), )] (%9363:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=806)]) -> (%9366:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=806)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=808, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=808, solved=0), )] (%9364:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=808)]) -> (%9364:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=808)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=808, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=808, solved=0), )] (%9364:tensor<[1, 32, 8, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=808)]) -> (%9367:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=808)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=804, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=810, solved=0))] (%9365:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=804)]) -> (%9368:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=806, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=812, solved=0))] (%9366:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=806)]) -> (%9369:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), )] (%9368:tensor<[1, 16, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)]) -> (%9368:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), )] (%9368:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)]) -> (%9368:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), )] (%9368:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)]) -> (%9370:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)]) + linalg.CPU.ConcatOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), )] (%9370:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)], %9368:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)]) -> (%9371:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), )] (%9371:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> 
(%9372:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), )] (%9368:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9373:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), )] (%9373:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=809)], %9372:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)]) -> (%9374:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), )] (%9369:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) -> (%9369:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), )] (%9369:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) -> (%9369:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) + linalg.CPU.NegOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), )] (%9369:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) -> (%9375:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), )] (%9375:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)], %9369:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) -> (%9376:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), )] (%9376:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9377:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), )] (%9369:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9378:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=811)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), )] (%9378:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)], %9377:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) -> (%9379:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=813, solved=0), )] (%9379:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) -> (%9380:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=813)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=813, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814, solved=0), )] 
(%9380:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=813)]) -> (%9381:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814, solved=0), )] (%9381:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814)]) -> (%9383:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=808, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=815, solved=0), )] (%9367:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=808)]) -> (%9384:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=815)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=815, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=816, solved=0), )] (%9384:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=815)]) -> (%9385:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, 
quant_to_type: UInt8, scale_type: Float32), uuid=816)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24, solved=0), )] (%8250:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %9383:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814)]) -> (%9387:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=816, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52, solved=0), )] (%8251:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52)], %9385:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=816)]) -> (%9388:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52)]) + 
linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24, solved=0), )] (%9387:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)]) -> (%9389:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52, solved=0), )] (%9388:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52)]) -> (%9390:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=817, solved=0), )] (%9374:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=809)], %9389:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)]) -> (%9391:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=817)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=817, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=818, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=817, solved=0), )] (%9391:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=817)], %9392:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=818), constant:[0.088388346]]) -> (%9393:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=817)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=817, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=819, solved=0), )] (%9393:tensor<[1, 16, 32, 1024], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=817)]) -> (%9394:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=819)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=819, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=820, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=819, solved=0), )] (%9394:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=819)], %9395:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=820), constant:[-20]]) -> (%9396:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=819)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=821, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=822, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9397:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=821), constant:[0]]) -> (%9398:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), 
uuid=822)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=822, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=817, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=819, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=819, solved=0), )] (%9398:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=822)], %9393:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=817)], %9396:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=819)]) -> (%9399:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=819)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=819, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=823, solved=0), )] (%9399:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=819)]) -> (%9400:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=823)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=823, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=824, solved=0), )] (%9400:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=823)], %9390:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52)]) -> (%9401:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=824)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=824, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=824, solved=0), )] (%9401:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=824)]) -> (%9402:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=824)]) + linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=824, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=824, solved=0), )] (%9402:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=824)]) -> (%9402:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=824)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=824, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=826, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=825, solved=0))] (%9402:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=824)]) -> (%9403:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=826)]) + cf.ReturnOp (%9403:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=826)], %9383:tensor<[1, 8, 128, 32], UInt8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814)], %9385:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=816)]) -> () } } graph.SubGraphOp @model.layers.21.mlp [using_qnn:true, symbol:model.layers.21.mlp] { - (%8971:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=805)]) -> (%8976:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=805), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=808), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=807))] (%8971:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=805)]) -> (%8972:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=808)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=808), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=809), )] (%8972:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=808)]) -> (%8973:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=809)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=805), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=811), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=810))] (%8971:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=805)]) -> (%8974:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=811)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=809), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=811), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=809), )] (%8973:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=809)], %8974:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=811)]) -> (%8975:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=809)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=809), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=812))] (%8975:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=809)]) -> (%8976:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)]) - cf.ReturnOp (%8976:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)]) -> () + (%9405:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=827)]) -> (%9411:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=835)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=827, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=830, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: 
UInt4, scale_1_type: Float32), uuid=829, solved=0))] (%9405:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=827)]) -> (%9406:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=830)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=827, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=831, solved=0))] (%9405:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=827)]) -> (%9407:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=833, solved=0), )] (%9407:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832)]) -> (%9408:tensor<[1, 
32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=833)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=833, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832, solved=0), )] (%9407:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832)], %9408:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=833)]) -> (%9409:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=830, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832, solved=0), )] (%9409:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=832)], %9406:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=830)]) -> (%9410:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=835, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=834, solved=0))] (%9410:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832)]) -> (%9411:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=835)]) + cf.ReturnOp (%9411:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=835)]) -> () } } graph.SubGraphOp @model.layers.22 [using_qnn:true, symbol:model.layers.22] { - (%8977:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8059:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %8060:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) -> (%9018:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)], %8991:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)], %8993:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=828)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=815))] (%8977:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)]) -> (%8978:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814)]) 
- graph.CallGraphOp @model.layers.22.self_attn (%8978:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8059:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %8060:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) -> (%9010:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838)], %8991:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)], %8993:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=828)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838), )] (%9010:tensor<[1, 32, 
2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838)], %8977:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=813)]) -> (%9011:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=839), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=840))] (%9011:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838)]) -> (%9012:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=839)]) - graph.CallGraphOp @model.layers.22.mlp (%9012:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=839)]) -> (%9017:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=838), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847), )] (%9017:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)], %9011:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838)]) -> (%9018:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)]) - cf.ReturnOp (%9018:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)], %8991:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)], %8993:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=828)]) -> () + (%9412:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8252:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=25)], %8253:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53)]) -> (%9464:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9435:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849)], %9437:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=851)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=836, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=837, solved=0))] (%9412:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9413:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=836)]) + graph.CallGraphOp @model.layers.22.self_attn (%9413:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=836)], %8267:tensor<[1, 1, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8252:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %8253:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53)]) -> (%9455:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=861)], %9435:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849)], %9437:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=851)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=861, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9412:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=60)], %9455:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=861)]) -> (%9456:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=862, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=863, solved=0))] (%9456:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9457:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=862)]) + graph.CallGraphOp @model.layers.22.mlp (%9457:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=862)]) -> (%9463:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=870)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=870, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9456:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9463:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=870)]) -> (%9464:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + cf.ReturnOp (%9464:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9435:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849)], %9437:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=851)]) -> () } } graph.SubGraphOp @model.layers.22.self_attn [using_qnn:true, symbol:model.layers.22.self_attn] { - (%8978:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8059:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %8060:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) -> (%9010:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838)], %8991:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)], %8993:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=828)]) { - linalg.CPU.LinearOp (%8978:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814)]) -> (%8979:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=820)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=816))] (%8978:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814)]) -> (%8980:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=818))] (%8978:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814)]) -> (%8981:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=820), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=820), )] (%8979:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=820)]) -> (%8979:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=820)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=820), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=820), )] (%8979:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=820)]) -> (%8982:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=820)]) - linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817), )] (%8980:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817)]) -> (%8980:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817), )] (%8980:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817)]) -> (%8983:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819), )] (%8981:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819)]) -> (%8981:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819)]) - 
linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819), )] (%8981:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819)]) -> (%8984:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=820), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=821), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=822))] (%8982:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=820)]) -> (%8985:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=821)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=823), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=824))] (%8983:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817)]) -> (%8986:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=823)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=821), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=821), )] (%8985:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=821)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8987:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=821)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=823), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=823), )] (%8986:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=823)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%8988:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=823)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=823), outputs_0:QuantSpec(Raw(type: Float16), uuid=825), )] (%8988:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=823)]) -> (%8989:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=825)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=825), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826), )] (%8989:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=825)]) -> (%8990:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826), )] (%8990:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)]) -> (%8991:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819), outputs_0:QuantSpec(Raw(type: Float16), uuid=827), )] (%8984:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819)]) -> (%8992:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=827)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=827), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=828), )] (%8992:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=827)]) -> (%8993:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=828)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25), )] (%8059:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %8991:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)]) -> (%8994:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=828), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53), )] (%8060:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)], %8993:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=828)]) -> (%8995:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25), )] (%8994:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)]) -> (%8996:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53), )] (%8995:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) -> (%8997:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=821), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829), )] (%8987:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=821)], %8996:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)]) -> (%8998:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829), inputs_1:QuantSpec(Raw(type: Float32), uuid=830), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829), )] (%8998:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=829)], %8999:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=830), constant:[0.088388346]]) -> (%9000:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=831), )] (%9000:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829)]) -> (%9001:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=831)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=831), inputs_1:QuantSpec(Raw(type: Int16), uuid=832), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=831), )] (%9001:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=831)], %9002:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=832), constant:[-20]]) -> (%9003:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=831)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=833), 
outputs_0:QuantSpec(Raw(type: UInt8), uuid=834), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9004:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=833), constant:[-0.42773438]]) -> (%9005:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=834)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=834), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=831), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=831), )] (%9005:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=834)], %9000:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829)], %9003:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=831)]) -> (%9006:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=831)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=831), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=835), )] (%9006:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=831)]) -> (%9007:tensor<[1, 16, 32, 
1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=835)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=835), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836), )] (%9007:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=835)], %8997:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) -> (%9008:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836), )] (%9008:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836)]) -> (%9009:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836), )] (%9009:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836)]) -> (%9009:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=837))] (%9009:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836)]) -> (%9010:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838)]) - cf.ReturnOp (%9010:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=838)], %8991:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)], %8993:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=828)]) -> () + (%9413:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=836)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8252:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %8253:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53)]) -> (%9455:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=861)], %9435:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849)], %9437:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=851)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=836, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=839, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=838, solved=0))] (%9413:tensor<[1, 32, 
2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=836)]) -> (%9414:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=839)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=836, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=841, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=840, solved=0))] (%9413:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=836)]) -> (%9415:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=841)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=836, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=843, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=842, solved=0))] (%9413:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=836)]) -> (%9416:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=843)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=839, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=839, solved=0), )] (%9414:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=839)]) -> (%9414:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=839)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=839, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=839, solved=0), )] (%9414:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=839)]) -> (%9417:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=839)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=841, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=841, solved=0), )] (%9415:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=841)]) -> (%9415:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=841)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=841, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=841, solved=0), )] (%9415:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=841)]) -> (%9418:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=841)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=843, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=843, solved=0), )] (%9416:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=843)]) -> (%9416:tensor<[1, 32, 8, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=843)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=843, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=843, solved=0), )] (%9416:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=843)]) -> (%9419:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=843)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=839, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=845, solved=0))] (%9417:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=839)]) -> (%9420:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=841, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=847, solved=0))] (%9418:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=841)]) -> (%9421:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> 
(%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), )] (%9420:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)]) -> (%9420:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), )] (%9420:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)]) -> (%9420:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), )] (%9420:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)]) -> (%9422:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), )] (%9422:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)], %9420:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)]) -> (%9423:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), )] (%9423:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9424:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), )] (%9420:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9425:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=844, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), )] (%9425:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)], %9424:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)]) -> (%9426:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), )] (%9421:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) -> (%9421:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), )] (%9421:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) -> (%9421:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), )] (%9421:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) -> (%9427:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), )] (%9427:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=846)], %9421:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) -> (%9428:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), )] (%9428:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9429:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), )] 
(%9421:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9430:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), )] (%9430:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)], %9429:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) -> (%9431:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=848, solved=0), )] 
(%9431:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) -> (%9432:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=848)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=848, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849, solved=0), )] (%9432:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=848)]) -> (%9433:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849, solved=0), )] (%9433:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849)]) -> (%9435:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=843, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=850, solved=0), )] (%9419:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=843)]) -> (%9436:tensor<[1, 8, 32, 
128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=850)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=850, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=851, solved=0), )] (%9436:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=850)]) -> (%9437:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=851)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25, solved=0), )] (%8252:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %9435:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849)]) -> (%9439:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=851, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53, solved=0), )] 
(%8253:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53)], %9437:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=851)]) -> (%9440:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25, solved=0), )] (%9439:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)]) -> (%9441:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53, solved=0), )] (%9440:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53)]) -> (%9442:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=852, solved=0), )] (%9426:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)], %9441:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)]) -> (%9443:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=852)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=852, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=853, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=852, solved=0), )] (%9443:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=852)], %9444:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=853), constant:[0.088388346]]) -> (%9445:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=852)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=852, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=854, solved=0), )] (%9445:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=852)]) -> (%9446:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=854)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=854, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=855, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=854, solved=0), )] (%9446:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=854)], %9447:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=855), constant:[-20]]) -> (%9448:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=854)]) + 
linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=856, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=857, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9449:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=856), constant:[0]]) -> (%9450:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=857)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=857, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=852, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=854, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=854, solved=0), )] (%9450:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=857)], %9445:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=852)], %9448:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=854)]) -> (%9451:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=854)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=854, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=858, solved=0), )] (%9451:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=854)]) -> (%9452:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=858)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=858, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=859, solved=0), )] (%9452:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=858)], %9442:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53)]) -> (%9453:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=859)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=859, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=859, solved=0), )] (%9453:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=859)]) -> (%9454:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=859)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=859, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=859, solved=0), )] (%9454:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=859)]) -> (%9454:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=859)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=859, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=861, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=860, solved=0))] (%9454:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=859)]) -> (%9455:tensor<[1, 
32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=861)]) + cf.ReturnOp (%9455:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=861)], %9435:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849)], %9437:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=851)]) -> () } } graph.SubGraphOp @model.layers.22.mlp [using_qnn:true, symbol:model.layers.22.mlp] { - (%9012:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=839)]) -> (%9017:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=839), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=842), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=841))] (%9012:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=839)]) -> (%9013:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=842)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=842), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=843), )] (%9013:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=842)]) -> (%9014:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=843)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=839), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=845), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=844))] (%9012:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=839)]) -> (%9015:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=845)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=843), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=845), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=843), )] (%9014:tensor<[1, 32, 6144], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=843)], %9015:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=845)]) -> (%9016:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=843)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=843), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=846))] (%9016:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=843)]) -> (%9017:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)]) - cf.ReturnOp (%9017:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)]) -> () + (%9457:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=862)]) -> (%9463:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=870)]) { + linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=862, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=865, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=864, solved=0))] (%9457:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=862)]) -> (%9458:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=865)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=862, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=866, solved=0))] (%9457:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=862)]) -> (%9459:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=868, solved=0), )] (%9459:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867)]) -> (%9460:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=868)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=868, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867, solved=0), )] (%9459:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867)], %9460:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=868)]) -> (%9461:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867, solved=0), 
inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=865, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867, solved=0), )] (%9461:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867)], %9458:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=865)]) -> (%9462:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=870, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=869, solved=0))] (%9462:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867)]) -> (%9463:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=870)]) + cf.ReturnOp (%9463:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=870)]) -> () } } graph.SubGraphOp @model.layers.23 [using_qnn:true, symbol:model.layers.23] { - (%9018:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8061:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %8062:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) -> (%9059:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)], %9032:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860)], %9034:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=862)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=848), 
weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=849))] (%9018:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)]) -> (%9019:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=848)]) - graph.CallGraphOp @model.layers.23.self_attn (%9019:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=848)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8061:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %8062:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) -> (%9051:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872)], %9032:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860)], %9034:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=862)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872), )] (%9051:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872)], %9018:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)]) -> (%9052:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=873), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=874))] (%9052:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872)]) -> (%9053:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=873)]) - graph.CallGraphOp @model.layers.23.mlp (%9053:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=873)]) -> (%9058:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881), )] (%9058:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)], %9052:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872)]) -> (%9059:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)]) - cf.ReturnOp (%9059:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)], %9032:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860)], %9034:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=862)]) -> () + (%9464:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8254:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %8255:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54)]) -> (%9516:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9487:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884)], %9489:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=886)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=871, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=872, solved=0))] (%9464:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=60)]) -> (%9465:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=871)]) + graph.CallGraphOp @model.layers.23.self_attn (%9465:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=871)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8254:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %8255:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54)]) -> (%9507:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=896)], %9487:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884)], %9489:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=886)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=896, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9464:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9507:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=896)]) -> (%9508:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=897, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=898, solved=0))] (%9508:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9509:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=897)]) + graph.CallGraphOp @model.layers.23.mlp (%9509:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=897)]) -> (%9515:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=905)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=905, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9508:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9515:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=905)]) -> (%9516:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + cf.ReturnOp (%9516:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9487:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884)], %9489:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: 
UInt8, scale_type: Float32), uuid=886)]) -> () } } graph.SubGraphOp @model.layers.23.self_attn [using_qnn:true, symbol:model.layers.23.self_attn] { - (%9019:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=848)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8061:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %8062:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) -> (%9051:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872)], %9032:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860)], %9034:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=862)]) { - linalg.CPU.LinearOp (%9019:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=848)]) -> (%9020:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=854)]) - linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=848), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=850))] (%9019:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=848)]) -> (%9021:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=848), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=853), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=852))] (%9019:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=848)]) -> (%9022:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=853)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=854), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=854), )] (%9020:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=854)]) -> (%9020:tensor<[1, 32, 16, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=854)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=854), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=854), )] (%9020:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=854)]) -> (%9023:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=854)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851), )] (%9021:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851)]) -> (%9021:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851), )] (%9021:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851)]) -> (%9024:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=853), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=853), )] (%9022:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=853)]) -> (%9022:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=853)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=853), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=853), )] (%9022:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=853)]) -> (%9025:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=853)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=854), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=855), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=856))] (%9023:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=854)]) -> (%9026:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=855)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=857), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=858))] (%9024:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851)]) -> (%9027:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=857)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=855), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=855), )] (%9026:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=855)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%9028:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=855)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=857), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=857), )] (%9027:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=857)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%9029:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=857)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=857), outputs_0:QuantSpec(Raw(type: Float16), uuid=859), )] (%9029:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=857)]) -> (%9030:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=859)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=859), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860), )] (%9030:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=859)]) -> (%9031:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860), )] (%9031:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860)]) -> (%9032:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=853), outputs_0:QuantSpec(Raw(type: Float16), uuid=861), )] (%9025:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=853)]) -> (%9033:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=861)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=861), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=862), )] (%9033:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=861)]) -> (%9034:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=862)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26), )] (%8061:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %9032:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860)]) -> (%9035:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=862), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54), )] (%8062:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)], %9034:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=862)]) -> (%9036:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26), )] 
(%9035:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)]) -> (%9037:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54), )] (%9036:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) -> (%9038:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=855), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=863), )] (%9028:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=855)], %9037:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)]) -> (%9039:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=863)]) - linalg.CPU.MulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=863), inputs_1:QuantSpec(Raw(type: Float32), uuid=864), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=863), )] (%9039:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=863)], %9040:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=864), constant:[0.088388346]]) -> (%9041:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=863)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=863), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=865), )] (%9041:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=863)]) -> (%9042:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=865)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=865), inputs_1:QuantSpec(Raw(type: Int16), uuid=866), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=865), )] (%9042:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=865)], %9043:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=866), constant:[-20]]) -> (%9044:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=865)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=867), outputs_0:QuantSpec(Raw(type: UInt8), uuid=868), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9045:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=867), constant:[0.96484375]]) -> (%9046:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=868)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=868), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=863), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=865), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=865), )] (%9046:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=868)], %9041:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=863)], %9044:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=865)]) -> (%9047:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=865)]) - linalg.CPU.SoftmaxOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=865), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=869), )] (%9047:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=865)]) -> (%9048:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=869)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=869), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870), )] (%9048:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=869)], %9038:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) -> (%9049:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870), )] (%9049:tensor<[1, 16, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870)]) -> (%9050:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870), )] (%9050:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870)]) -> (%9050:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=871))] (%9050:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870)]) -> (%9051:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872)]) - cf.ReturnOp (%9051:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=872)], %9032:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=860)], %9034:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=862)]) -> () + (%9465:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=871)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8254:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %8255:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54)]) -> (%9507:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=896)], %9487:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884)], %9489:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=886)]) { + linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=871, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=874, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=873, solved=0))] (%9465:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=871)]) -> (%9466:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=874)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=871, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=876, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=875, solved=0))] (%9465:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=871)]) -> (%9467:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=876)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=871, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=878, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=877, solved=0))] (%9465:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=871)]) -> (%9468:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=878)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=874, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=874, solved=0), )] (%9466:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=874)]) -> (%9466:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=874)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=874, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=874, solved=0), )] (%9466:tensor<[1, 32, 16, 
128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=874)]) -> (%9469:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=874)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=876, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=876, solved=0), )] (%9467:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=876)]) -> (%9467:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=876)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=876, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=876, solved=0), )] (%9467:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=876)]) -> (%9470:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=876)]) + linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=878, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=878, solved=0), )] (%9468:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=878)]) -> (%9468:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=878)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=878, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=878, solved=0), )] (%9468:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=878)]) -> (%9471:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=878)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=874, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=880, solved=0))] (%9469:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=874)]) -> (%9472:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=876, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=882, solved=0))] (%9470:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=876)]) -> (%9473:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), )] (%9472:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)]) -> (%9472:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), )] (%9472:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)]) -> (%9472:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), )] (%9472:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)]) -> (%9474:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), )] (%9474:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)], %9472:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)]) -> 
(%9475:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), )] (%9475:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9476:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), )] (%9472:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)], %8268:tensor<[1, 1, 
32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9477:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), )] (%9477:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)], %9476:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)]) -> (%9478:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), )] (%9473:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)]) -> (%9473:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), )] (%9473:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)]) -> (%9473:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), )] (%9473:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)]) -> (%9479:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=881, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), )] (%9479:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)], %9473:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)]) -> (%9480:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), )] (%9480:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9481:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=881)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), )] (%9473:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9482:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), )] (%9482:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)], %9481:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=881)]) -> (%9483:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=883, solved=0), )] (%9483:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)]) -> (%9484:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=883)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=883, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884, solved=0), )] (%9484:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=883)]) -> (%9485:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884, solved=0), )] (%9485:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884)]) -> (%9487:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), 
uuid=884)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=878, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=885, solved=0), )] (%9471:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=878)]) -> (%9488:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=885)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=885, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=886, solved=0), )] (%9488:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=885)]) -> (%9489:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=886)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26, solved=0), )] (%8254:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %9487:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884)]) -> (%9491:tensor<[1, 8, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=886, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54, solved=0), )] (%8255:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54)], %9489:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=886)]) -> (%9492:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26, solved=0), )] (%9491:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)]) -> (%9493:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, 
quant_to_type: UInt8, scale_type: Float32), uuid=54, solved=0), )] (%9492:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54)]) -> (%9494:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=887, solved=0), )] (%9478:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)], %9493:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)]) -> (%9495:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=887)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=887, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=888, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=887, solved=0), )] (%9495:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=887)], %9496:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=888), constant:[0.088388346]]) -> (%9497:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=887)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=887, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=889, solved=0), )] (%9497:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=887)]) -> (%9498:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=889)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=889, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=890, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=889, solved=0), )] (%9498:tensor<[1, 16, 32, 1], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=889)], %9499:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=890), constant:[-20]]) -> (%9500:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=889)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=891, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=892, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9501:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=891), constant:[0]]) -> (%9502:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=892)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=892, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=887, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=889, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=889, solved=0), )] (%9502:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=892)], %9497:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=887)], %9500:tensor<[1, 16, 32, 1], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=889)]) -> (%9503:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=889)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=889, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=893, solved=0), )] (%9503:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=889)]) -> (%9504:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=893)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=893, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=894, solved=0), )] (%9504:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=893)], %9494:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, 
scale_type: Float32), uuid=54)]) -> (%9505:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=894)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=894, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=894, solved=0), )] (%9505:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=894)]) -> (%9506:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=894)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=894, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=894, solved=0), )] (%9506:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=894)]) -> (%9506:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=894)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=894, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=896, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=895, solved=0))] (%9506:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=894)]) -> (%9507:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=896)]) + cf.ReturnOp (%9507:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=896)], %9487:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884)], %9489:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=886)]) -> () } } graph.SubGraphOp @model.layers.23.mlp [using_qnn:true, symbol:model.layers.23.mlp] { - (%9053:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=873)]) -> (%9058:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=873), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=876), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=875))] (%9053:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=873)]) -> (%9054:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=876)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=876), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877), )] (%9054:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=876)]) -> (%9055:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=873), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=879), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=878))] (%9053:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=873)]) -> (%9056:tensor<[1, 32, 6144], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=879)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=879), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877), )] (%9055:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877)], %9056:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=879)]) -> (%9057:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=880))] (%9057:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877)]) -> (%9058:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)]) - cf.ReturnOp (%9058:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)]) -> () + (%9509:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=897)]) -> (%9515:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=905)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=897, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=900, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=899, solved=0))] (%9509:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=897)]) -> (%9510:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=900)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=897, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, 
scale_1_type: Float32), uuid=901, solved=0))] (%9509:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=897)]) -> (%9511:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=903, solved=0), )] (%9511:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902)]) -> (%9512:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=903)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=903, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902, solved=0), )] (%9511:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902)], %9512:tensor<[1, 32, 6144], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=903)]) -> (%9513:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=900, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902, solved=0), )] (%9513:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902)], %9510:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=900)]) -> (%9514:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=905, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: 
Float32), uuid=904, solved=0))] (%9514:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902)]) -> (%9515:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=905)]) + cf.ReturnOp (%9515:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=905)]) -> () } } graph.SubGraphOp @model.layers.24 [using_qnn:true, symbol:model.layers.24] { - (%9059:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8063:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %8064:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) -> (%9100:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)], %9073:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894)], %9075:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=896)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=883))] (%9059:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)]) -> (%9060:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882)]) - graph.CallGraphOp @model.layers.24.self_attn (%9060:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8063:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %8064:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) -> (%9092:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906)], %9073:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894)], %9075:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=896)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906), )] (%9092:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906)], %9059:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)]) -> (%9093:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=907), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=908))] 
(%9093:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906)]) -> (%9094:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=907)]) - graph.CallGraphOp @model.layers.24.mlp (%9094:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=907)]) -> (%9099:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915), )] (%9099:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)], %9093:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906)]) -> (%9100:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)]) - cf.ReturnOp (%9100:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)], %9073:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894)], %9075:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=896)]) -> () + (%9516:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8256:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %8257:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55)]) -> (%9568:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9539:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919)], %9541:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=921)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=906, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=907, solved=0))] (%9516:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9517:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=906)]) + graph.CallGraphOp @model.layers.24.self_attn (%9517:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=906)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8256:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %8257:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55)]) -> (%9559:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=931)], %9539:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919)], %9541:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=921)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=931, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9516:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9559:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=931)]) -> (%9560:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=932, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=933, solved=0))] (%9560:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9561:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=932)]) + graph.CallGraphOp @model.layers.24.mlp (%9561:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=932)]) -> (%9567:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=940)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=940, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9560:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9567:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=940)]) -> (%9568:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) 
+ cf.ReturnOp (%9568:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9539:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919)], %9541:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=921)]) -> () } } graph.SubGraphOp @model.layers.24.self_attn [using_qnn:true, symbol:model.layers.24.self_attn] { - (%9060:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8063:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %8064:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) -> (%9092:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906)], %9073:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894)], 
%9075:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=896)]) { - linalg.CPU.LinearOp (%9060:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882)]) -> (%9061:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=888)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=885), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=884))] (%9060:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882)]) -> (%9062:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=885)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=886))] (%9060:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882)]) -> (%9063:tensor<[1, 
32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=888), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=888), )] (%9061:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=888)]) -> (%9061:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=888)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=888), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=888), )] (%9061:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=888)]) -> (%9064:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=888)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=885), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=885), )] (%9062:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=885)]) -> (%9062:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=885)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=885), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=885), )] (%9062:tensor<[1, 32, 8, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=885)]) -> (%9065:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=885)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887), )] (%9063:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887)]) -> (%9063:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887), )] (%9063:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887)]) -> (%9066:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=888), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=890))] (%9064:tensor<[1, 16, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=888)]) -> (%9067:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=885), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=891), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=892))] (%9065:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=885)]) -> (%9068:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=891)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889), )] (%9067:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%9069:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=891), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=891), )] (%9068:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=891)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%9070:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=891)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=891), outputs_0:QuantSpec(Raw(type: Float16), uuid=893), )] (%9070:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=891)]) -> (%9071:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=893)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=893), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894), )] (%9071:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=893)]) -> (%9072:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894), )] (%9072:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894)]) -> (%9073:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887), outputs_0:QuantSpec(Raw(type: Float16), uuid=895), )] (%9066:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887)]) -> (%9074:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=895)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=895), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=896), )] (%9074:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=895)]) -> (%9075:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=896)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27), )] (%8063:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %9073:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894)]) -> (%9076:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=896), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55), )] (%8064:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)], %9075:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=896)]) -> 
(%9077:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27), )] (%9076:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)]) -> (%9078:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55), )] (%9077:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) -> (%9079:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=897), )] (%9069:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889)], %9078:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)]) -> (%9080:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=897)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=897), inputs_1:QuantSpec(Raw(type: Float32), uuid=898), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=897), )] (%9080:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=897)], %9081:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=898), constant:[0.088388346]]) -> (%9082:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=897)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=897), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899), )] (%9082:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=897)]) -> (%9083:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899)]) - linalg.CPU.AddOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899), inputs_1:QuantSpec(Raw(type: Int16), uuid=900), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899), )] (%9083:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899)], %9084:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=900), constant:[-20]]) -> (%9085:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=901), outputs_0:QuantSpec(Raw(type: UInt8), uuid=902), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9086:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=901), constant:[0.07910156]]) -> (%9087:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=902)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=902), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=897), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899), )] (%9087:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=902)], %9082:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=897)], %9085:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899)]) -> (%9088:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=903), )] (%9088:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899)]) -> (%9089:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=903)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=903), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904), )] (%9089:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=903)], %9079:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) -> (%9090:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=904)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904), )] (%9090:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904)]) -> (%9091:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904), )] (%9091:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904)]) -> (%9091:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=905))] (%9091:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=904)]) -> (%9092:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906)]) - cf.ReturnOp (%9092:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=906)], %9073:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=894)], %9075:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=896)]) -> () + (%9517:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=906)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8256:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %8257:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55)]) -> (%9559:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=931)], 
%9539:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919)], %9541:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=921)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=906, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=909, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=908, solved=0))] (%9517:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=906)]) -> (%9518:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=909)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=906, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=911, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=910, solved=0))] (%9517:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=906)]) -> (%9519:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=911)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=906, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=913, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=912, solved=0))] (%9517:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=906)]) -> (%9520:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=913)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=909, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=909, solved=0), )] (%9518:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=909)]) -> (%9518:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=909)]) + linalg.CPU.TransposeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=909, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=909, solved=0), )] (%9518:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=909)]) -> (%9521:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=909)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=911, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=911, solved=0), )] (%9519:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=911)]) -> (%9519:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=911)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=911, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=911, solved=0), )] (%9519:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=911)]) -> (%9522:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=911)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=913, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=913, solved=0), )] (%9520:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=913)]) -> (%9520:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=913)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=913, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=913, solved=0), )] (%9520:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=913)]) -> (%9523:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=913)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=909, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=915, solved=0))] (%9521:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=909)]) -> (%9524:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=911, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=917, solved=0))] (%9522:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=911)]) -> (%9525:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), )] (%9524:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)]) -> (%9524:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=914, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), )] (%9524:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)]) -> (%9524:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), )] (%9524:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)]) -> (%9526:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), )] (%9526:tensor<[1, 16, 32, 64], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)], %9524:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)]) -> (%9527:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), )] (%9527:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9528:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), )] (%9524:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9529:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), )] (%9529:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)], %9528:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)]) -> (%9530:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), )] (%9525:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) -> (%9525:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), )] (%9525:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) -> (%9525:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), )] (%9525:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) 
-> (%9531:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), )] (%9531:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)], %9525:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) -> (%9532:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), )] (%9532:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=916)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9533:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), )] (%9525:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9534:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, 
solved=0), )] (%9534:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)], %9533:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) -> (%9535:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=918, solved=0), )] (%9535:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) -> (%9536:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=918)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=918, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919, solved=0), )] (%9536:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=918)]) -> (%9537:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, 
scale_type: Float32), uuid=919, solved=0), )] (%9537:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919)]) -> (%9539:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=913, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=920, solved=0), )] (%9523:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=913)]) -> (%9540:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=920)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=920, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=921, solved=0), )] (%9540:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=920)]) -> (%9541:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=921)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27, solved=0), )] (%8256:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %9539:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919)]) -> (%9543:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=921, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55, solved=0), )] (%8257:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55)], %9541:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=921)]) -> (%9544:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27, solved=0), )] (%9543:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)]) -> (%9545:tensor<[1, 16, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55, solved=0), )] (%9544:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55)]) -> (%9546:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=922, solved=0), )] (%9530:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)], %9545:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)]) -> (%9547:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=922)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=922, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=923, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=922, solved=0), )] (%9547:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=922)], %9548:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=923), constant:[0.088388346]]) -> (%9549:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=922)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=922, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=924, solved=0), )] (%9549:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=922)]) -> (%9550:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=924)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=924, 
solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=925, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=924, solved=0), )] (%9550:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=924)], %9551:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=925), constant:[-20]]) -> (%9552:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=924)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=926, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=927, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9553:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=926), constant:[0]]) -> (%9554:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=927)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=927, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=922, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=924, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=924, solved=0), )] (%9554:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=927)], %9549:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=922)], %9552:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=924)]) -> (%9555:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=924)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=924, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=928, solved=0), )] (%9555:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=924)]) -> (%9556:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=928)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=928, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=929, solved=0), )] (%9556:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=928)], %9546:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55)]) -> (%9557:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=929)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=929, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=929, solved=0), )] (%9557:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=929)]) -> (%9558:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=929)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=929, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=929, solved=0), )] (%9558:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=929)]) -> (%9558:tensor<[1, 32, 2048], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=929)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=929, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=931, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=930, solved=0))] (%9558:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=929)]) -> (%9559:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=931)]) + cf.ReturnOp (%9559:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=931)], %9539:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919)], %9541:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=921)]) -> () } } graph.SubGraphOp @model.layers.24.mlp [using_qnn:true, symbol:model.layers.24.mlp] { - (%9094:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=907)]) -> (%9099:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=907), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=910), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=909))] (%9094:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=907)]) -> (%9095:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=910)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=910), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=911), )] (%9095:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=910)]) -> (%9096:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=911)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=907), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=913), weight_weight:QuantSpec(LPBQ(quant_min: -8, 
quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=912))] (%9094:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=907)]) -> (%9097:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=913)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=911), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=913), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=911), )] (%9096:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=911)], %9097:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=913)]) -> (%9098:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=911)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=911), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=914))] (%9098:tensor<[1, 32, 6144], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=911)]) -> (%9099:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)]) - cf.ReturnOp (%9099:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)]) -> () + (%9561:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=932)]) -> (%9567:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=940)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=932, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=935, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=934, solved=0))] (%9561:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=932)]) -> (%9562:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=935)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=932, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=937, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=936, solved=0))] (%9561:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=932)]) -> (%9563:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=937)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=937, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=938, solved=0), )] (%9563:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=937)]) -> (%9564:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=938)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=937, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=938, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=937, solved=0), )] (%9563:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=937)], %9564:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=938)]) -> (%9565:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=937)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=937, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=935, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=937, solved=0), )] (%9565:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=937)], %9562:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=935)]) -> (%9566:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=937)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=937, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=940, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=939, solved=0))] (%9566:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=937)]) -> (%9567:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=940)]) + cf.ReturnOp (%9567:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=940)]) -> () } } graph.SubGraphOp @model.layers.25 [using_qnn:true, symbol:model.layers.25] { - (%9100:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8065:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %8066:tensor<[1, 8, 992, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) -> (%9141:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)], %9114:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928)], %9116:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=930)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=916), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=917))] (%9100:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)]) -> (%9101:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=916)]) - graph.CallGraphOp @model.layers.25.self_attn (%9101:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=916)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8065:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %8066:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) -> (%9133:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940)], %9114:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928)], %9116:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=930)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940), )] (%9133:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940)], %9100:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=915)]) -> (%9134:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940)]) - linalg.CPU.RMSNormOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=941), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=942))] (%9134:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940)]) -> (%9135:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=941)]) - graph.CallGraphOp @model.layers.25.mlp (%9135:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=941)]) -> (%9140:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949), )] (%9140:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)], %9134:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940)]) -> (%9141:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)]) - cf.ReturnOp (%9141:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)], %9114:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928)], %9116:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=930)]) -> () + (%9568:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8258:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %8259:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56)]) -> (%9620:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9591:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, 
quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954)], %9593:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=956)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=941, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=942, solved=0))] (%9568:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9569:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=941)]) + graph.CallGraphOp @model.layers.25.self_attn (%9569:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=941)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8258:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=28)], %8259:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56)]) -> (%9611:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=966)], %9591:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954)], %9593:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=956)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=966, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9568:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9611:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=966)]) -> (%9612:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=967, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=968, solved=0))] (%9612:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9613:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=967)]) + graph.CallGraphOp @model.layers.25.mlp (%9613:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=967)]) -> (%9619:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=975)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=975, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9612:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], 
%9619:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=975)]) -> (%9620:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + cf.ReturnOp (%9620:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9591:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954)], %9593:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=956)]) -> () } } graph.SubGraphOp @model.layers.25.self_attn [using_qnn:true, symbol:model.layers.25.self_attn] { - (%9101:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=916)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8065:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %8066:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=56)]) -> (%9133:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940)], %9114:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928)], %9116:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=930)]) { - linalg.CPU.LinearOp (%9101:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=916)]) -> (%9102:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=922)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=916), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=919), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=918))] (%9101:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=916)]) -> (%9103:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=919)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=916), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=921), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=920))] (%9101:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=916)]) -> (%9104:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=921)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=922), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=922), )] (%9102:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=922)]) -> (%9102:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=922)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=922), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=922), )] (%9102:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=922)]) -> (%9105:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=922)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=919), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=919), )] (%9103:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=919)]) -> (%9103:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=919)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=919), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=919), )] (%9103:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=919)]) -> (%9106:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=919)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=921), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=921), )] (%9104:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=921)]) -> (%9104:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=921)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=921), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=921), )] (%9104:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=921)]) -> (%9107:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=921)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=922), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=923), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=924))] (%9105:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=922)]) -> (%9108:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=923)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=919), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=925), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=926))] (%9106:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=919)]) -> (%9109:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=925)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=923), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=923), )] (%9108:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=923)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%9110:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=923)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=925), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=925), )] (%9109:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=925)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%9111:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=925)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=925), outputs_0:QuantSpec(Raw(type: Float16), uuid=927), )] (%9111:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=925)]) -> (%9112:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=927)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=927), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928), )] (%9112:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=927)]) -> (%9113:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928), )] (%9113:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928)]) -> (%9114:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=921), outputs_0:QuantSpec(Raw(type: Float16), uuid=929), )] (%9107:tensor<[1, 8, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=921)]) -> (%9115:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=929)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=929), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=930), )] (%9115:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=929)]) -> (%9116:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=930)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28), )] (%8065:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %9114:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928)]) -> (%9117:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=930), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56), )] (%8066:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)], %9116:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=930)]) -> (%9118:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28), )] (%9117:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)]) -> (%9119:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56), )] (%9118:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) -> (%9120:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) - linalg.CPU.MatMulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=923), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=931), )] (%9110:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=923)], %9119:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)]) -> (%9121:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=931)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=931), inputs_1:QuantSpec(Raw(type: Float32), uuid=932), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=931), )] (%9121:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=931)], %9122:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=932), constant:[0.088388346]]) -> (%9123:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=931)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=931), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=933), )] (%9123:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=931)]) -> (%9124:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=933)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=933), inputs_1:QuantSpec(Raw(type: Int16), uuid=934), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=933), )] (%9124:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=933)], %9125:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=934), constant:[-20]]) -> (%9126:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=933)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=935), outputs_0:QuantSpec(Raw(type: UInt8), uuid=936), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9127:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=935), constant:[-0.9921875]]) -> (%9128:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=936)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=936), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=931), 
inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=933), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=933), )] (%9128:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=936)], %9123:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=931)], %9126:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=933)]) -> (%9129:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=933)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=933), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=937), )] (%9129:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=933)]) -> (%9130:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=937)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=937), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=938), )] 
(%9130:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=937)], %9120:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) -> (%9131:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=938)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=938), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=938), )] (%9131:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=938)]) -> (%9132:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=938)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=938), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=938), )] (%9132:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=938)]) -> (%9132:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=938)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=938), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=939))] (%9132:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=938)]) -> (%9133:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940)]) - cf.ReturnOp (%9133:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=940)], %9114:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=928)], %9116:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=930)]) -> () + (%9569:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=941)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8258:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %8259:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56)]) -> (%9611:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=966)], %9591:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954)], %9593:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=956)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=941, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=944, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=943, solved=0))] (%9569:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=941)]) -> (%9570:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=944)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=941, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=946, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=945, solved=0))] (%9569:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=941)]) -> (%9571:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=946)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=941, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=948, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=947, solved=0))] (%9569:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=941)]) -> (%9572:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=948)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=944, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=944, solved=0), )] (%9570:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=944)]) -> (%9570:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=944)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=944, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=944, solved=0), )] (%9570:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=944)]) -> (%9573:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=944)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=946, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=946, solved=0), )] (%9571:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=946)]) -> (%9571:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=946)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=946, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=946, solved=0), )] (%9571:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=946)]) -> (%9574:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=946)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=948, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=948, solved=0), )] (%9572:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=948)]) -> (%9572:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=948)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=948, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=948, solved=0), )] (%9572:tensor<[1, 32, 8, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=948)]) -> (%9575:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=948)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=944, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=950, solved=0))] (%9573:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=944)]) -> (%9576:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=946, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=952, solved=0))] (%9574:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=946)]) -> (%9577:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), )] (%9576:tensor<[1, 16, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)]) -> (%9576:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), )] (%9576:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)]) -> (%9576:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), )] (%9576:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)]) -> (%9578:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)]) + linalg.CPU.ConcatOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), )] (%9578:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)], %9576:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)]) -> (%9579:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), )] (%9579:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> 
(%9580:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), )] (%9576:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9581:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), )] (%9581:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=949)], %9580:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)]) -> (%9582:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), )] (%9577:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) -> (%9577:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), )] (%9577:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) -> (%9577:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) + linalg.CPU.NegOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), )] (%9577:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) -> (%9583:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), )] (%9583:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)], %9577:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) -> (%9584:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), )] (%9584:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9585:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), )] (%9577:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9586:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=951)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), )] (%9586:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)], %9585:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) -> (%9587:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=953, solved=0), )] (%9587:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) -> (%9588:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=953)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=953, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954, solved=0), )] 
(%9588:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=953)]) -> (%9589:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954, solved=0), )] (%9589:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954)]) -> (%9591:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=948, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=955, solved=0), )] (%9575:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=948)]) -> (%9592:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=955)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=955, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=956, solved=0), )] (%9592:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=955)]) -> (%9593:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, 
quant_to_type: UInt8, scale_type: Float32), uuid=956)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28, solved=0), )] (%8258:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %9591:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954)]) -> (%9595:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=956, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56, solved=0), )] (%8259:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56)], %9593:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=956)]) -> (%9596:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56)]) + 
linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28, solved=0), )] (%9595:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)]) -> (%9597:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56, solved=0), )] (%9596:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56)]) -> (%9598:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=957, solved=0), )] (%9582:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=949)], %9597:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)]) -> (%9599:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=957)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=957, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=958, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=957, solved=0), )] (%9599:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=957)], %9600:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=958), constant:[0.088388346]]) -> (%9601:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=957)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=957, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=959, solved=0), )] (%9601:tensor<[1, 16, 32, 1024], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=957)]) -> (%9602:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=959)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=959, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=960, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=959, solved=0), )] (%9602:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=959)], %9603:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=960), constant:[-20]]) -> (%9604:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=959)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=961, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=962, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9605:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=961), constant:[0]]) -> (%9606:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), 
uuid=962)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=962, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=957, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=959, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=959, solved=0), )] (%9606:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=962)], %9601:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=957)], %9604:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=959)]) -> (%9607:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=959)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=959, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=963, solved=0), )] (%9607:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=959)]) -> (%9608:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=963)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=963, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=964, solved=0), )] (%9608:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=963)], %9598:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56)]) -> (%9609:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=964)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=964, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=964, solved=0), )] (%9609:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=964)]) -> (%9610:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=964)]) + linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=964, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=964, solved=0), )] (%9610:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=964)]) -> (%9610:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=964)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=964, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=966, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=965, solved=0))] (%9610:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=964)]) -> (%9611:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=966)]) + cf.ReturnOp (%9611:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=966)], %9591:tensor<[1, 8, 128, 32], UInt8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954)], %9593:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=956)]) -> () } } graph.SubGraphOp @model.layers.25.mlp [using_qnn:true, symbol:model.layers.25.mlp] { - (%9135:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=941)]) -> (%9140:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=941), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=944), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=943))] (%9135:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=941)]) -> (%9136:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=944)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=944), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=945), )] (%9136:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=944)]) -> (%9137:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=945)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=941), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=947), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=946))] (%9135:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=941)]) -> (%9138:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=947)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=945), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=947), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=945), )] (%9137:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=945)], %9138:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=947)]) -> (%9139:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=945)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=945), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=948))] (%9139:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=945)]) -> (%9140:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)]) - cf.ReturnOp (%9140:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)]) -> () + (%9613:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=967)]) -> (%9619:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=975)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=967, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=970, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: 
UInt4, scale_1_type: Float32), uuid=969, solved=0))] (%9613:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=967)]) -> (%9614:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=970)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=967, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=971, solved=0))] (%9613:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=967)]) -> (%9615:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=973, solved=0), )] (%9615:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972)]) -> (%9616:tensor<[1, 
32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=973)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=973, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972, solved=0), )] (%9615:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972)], %9616:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=973)]) -> (%9617:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=970, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972, solved=0), )] (%9617:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=972)], %9614:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=970)]) -> (%9618:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=975, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=974, solved=0))] (%9618:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972)]) -> (%9619:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=975)]) + cf.ReturnOp (%9619:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=975)]) -> () } } graph.SubGraphOp @model.layers.26 [using_qnn:true, symbol:model.layers.26] { - (%9141:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8067:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %8068:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) -> (%9182:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)], %9155:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962)], %9157:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=964)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=950), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=951))] (%9141:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)]) -> (%9142:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=950)]) 
- graph.CallGraphOp @model.layers.26.self_attn (%9142:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=950)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8067:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %8068:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) -> (%9174:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974)], %9155:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962)], %9157:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=964)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974), )] (%9174:tensor<[1, 32, 
2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974)], %9141:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=949)]) -> (%9175:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=975), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=976))] (%9175:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974)]) -> (%9176:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=975)]) - graph.CallGraphOp @model.layers.26.mlp (%9176:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=975)]) -> (%9181:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=974), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983), )] (%9181:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)], %9175:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974)]) -> (%9182:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)]) - cf.ReturnOp (%9182:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)], %9155:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962)], %9157:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=964)]) -> () + (%9620:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8260:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=29)], %8261:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57)]) -> (%9672:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9643:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989)], %9645:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=991)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=976, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=977, solved=0))] (%9620:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9621:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=976)]) + graph.CallGraphOp @model.layers.26.self_attn (%9621:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=976)], %8267:tensor<[1, 1, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8260:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %8261:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57)]) -> (%9663:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1001)], %9643:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989)], %9645:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=991)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1001, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9620:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=60)], %9663:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1001)]) -> (%9664:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1002, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1003, solved=0))] (%9664:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9665:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1002)]) + graph.CallGraphOp @model.layers.26.mlp (%9665:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1002)]) -> (%9671:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1010)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1010, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9664:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9671:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1010)]) -> (%9672:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + cf.ReturnOp (%9672:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9643:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989)], %9645:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=991)]) -> () } } graph.SubGraphOp @model.layers.26.self_attn [using_qnn:true, symbol:model.layers.26.self_attn] { - (%9142:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=950)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8067:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %8068:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) -> (%9174:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974)], %9155:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962)], %9157:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=964)]) { - linalg.CPU.LinearOp (%9142:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=950)]) -> (%9143:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=956)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=950), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=953), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=952))] (%9142:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=950)]) -> (%9144:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=953)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=950), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=955), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=954))] (%9142:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=950)]) -> (%9145:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=955)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=956), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=956), )] (%9143:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=956)]) -> (%9143:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=956)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=956), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=956), )] (%9143:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=956)]) -> (%9146:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=956)]) - linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=953), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=953), )] (%9144:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=953)]) -> (%9144:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=953)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=953), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=953), )] (%9144:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=953)]) -> (%9147:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=953)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=955), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=955), )] (%9145:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=955)]) -> (%9145:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=955)]) - 
linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=955), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=955), )] (%9145:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=955)]) -> (%9148:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=955)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=956), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=957), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=958))] (%9146:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=956)]) -> (%9149:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=957)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=953), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=959), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=960))] (%9147:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=953)]) -> (%9150:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=959)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=957), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=957), )] (%9149:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=957)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%9151:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=957)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=959), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=959), )] (%9150:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=959)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%9152:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=959)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=959), outputs_0:QuantSpec(Raw(type: Float16), uuid=961), )] (%9152:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=959)]) -> (%9153:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=961)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=961), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962), )] (%9153:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=961)]) -> (%9154:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962), )] (%9154:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962)]) -> (%9155:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=955), outputs_0:QuantSpec(Raw(type: Float16), uuid=963), )] (%9148:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=955)]) -> (%9156:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=963)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=963), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=964), )] (%9156:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=963)]) -> (%9157:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=964)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29), )] (%8067:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %9155:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962)]) -> (%9158:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=964), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57), )] (%8068:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)], %9157:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=964)]) -> (%9159:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29), )] (%9158:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)]) -> (%9160:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57), )] (%9159:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) -> (%9161:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=957), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=965), )] (%9151:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=957)], %9160:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)]) -> (%9162:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=965)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=965), inputs_1:QuantSpec(Raw(type: Float32), uuid=966), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=965), )] (%9162:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=965)], %9163:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=966), constant:[0.088388346]]) -> (%9164:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=965)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=965), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=967), )] (%9164:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=965)]) -> (%9165:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=967)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=967), inputs_1:QuantSpec(Raw(type: Int16), uuid=968), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=967), )] (%9165:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=967)], %9166:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=968), constant:[-20]]) -> (%9167:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=967)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=969), 
outputs_0:QuantSpec(Raw(type: UInt8), uuid=970), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9168:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=969), constant:[0.27929688]]) -> (%9169:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=970)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=970), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=965), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=967), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=967), )] (%9169:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=970)], %9164:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=965)], %9167:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=967)]) -> (%9170:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=967)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=967), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=971), )] (%9170:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=967)]) -> (%9171:tensor<[1, 16, 32, 1024], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=971)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=971), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=972), )] (%9171:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=971)], %9161:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) -> (%9172:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=972)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=972), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=972), )] (%9172:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=972)]) -> (%9173:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=972)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=972), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=972), )] (%9173:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=972)]) -> (%9173:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=972)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=972), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=973))] (%9173:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=972)]) -> (%9174:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974)]) - cf.ReturnOp (%9174:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=974)], %9155:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=962)], %9157:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=964)]) -> () + (%9621:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=976)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8260:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %8261:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57)]) -> (%9663:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1001)], %9643:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989)], %9645:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=991)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=976, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=979, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=978, solved=0))] (%9621:tensor<[1, 32, 
2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=976)]) -> (%9622:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=979)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=976, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=981, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=980, solved=0))] (%9621:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=976)]) -> (%9623:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=981)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=976, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=983, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=982, solved=0))] (%9621:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=976)]) -> (%9624:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=983)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=979, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=979, solved=0), )] (%9622:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=979)]) -> (%9622:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=979)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=979, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=979, solved=0), )] (%9622:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=979)]) -> (%9625:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=979)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=981, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=981, solved=0), )] (%9623:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=981)]) -> (%9623:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=981)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=981, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=981, solved=0), )] (%9623:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=981)]) -> (%9626:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=981)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=983, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=983, solved=0), )] (%9624:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=983)]) -> (%9624:tensor<[1, 32, 8, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=983)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=983, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=983, solved=0), )] (%9624:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=983)]) -> (%9627:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=983)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=979, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=985, solved=0))] (%9625:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=979)]) -> (%9628:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=981, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=987, solved=0))] (%9626:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=981)]) -> (%9629:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> 
(%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), )] (%9628:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)]) -> (%9628:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), )] (%9628:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)]) -> (%9628:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), )] (%9628:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)]) -> (%9630:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), )] (%9630:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)], %9628:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)]) -> (%9631:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), )] (%9631:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9632:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), )] (%9628:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9633:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=984, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), )] (%9633:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)], %9632:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)]) -> (%9634:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), )] (%9629:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) -> (%9629:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), )] (%9629:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) -> (%9629:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), )] (%9629:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) -> (%9635:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), )] (%9635:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=986)], %9629:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) -> (%9636:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), )] (%9636:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9637:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), )] 
(%9629:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9638:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), )] (%9638:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)], %9637:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) -> (%9639:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=988, solved=0), )] 
(%9639:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) -> (%9640:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=988)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=988, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989, solved=0), )] (%9640:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=988)]) -> (%9641:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989, solved=0), )] (%9641:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989)]) -> (%9643:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=983, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=990, solved=0), )] (%9627:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=983)]) -> (%9644:tensor<[1, 8, 32, 
128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=990)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=990, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=991, solved=0), )] (%9644:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=990)]) -> (%9645:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=991)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29, solved=0), )] (%8260:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %9643:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989)]) -> (%9647:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=991, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57, solved=0), )] 
(%8261:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57)], %9645:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=991)]) -> (%9648:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29, solved=0), )] (%9647:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)]) -> (%9649:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57, solved=0), )] (%9648:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57)]) -> (%9650:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=992, solved=0), )] (%9634:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)], %9649:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)]) -> (%9651:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=992)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=992, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=993, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=992, solved=0), )] (%9651:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=992)], %9652:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=993), constant:[0.088388346]]) -> (%9653:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=992)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=992, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=994, solved=0), )] (%9653:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=992)]) -> (%9654:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=994)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=994, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=995, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=994, solved=0), )] (%9654:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=994)], %9655:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=995), constant:[-20]]) -> (%9656:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=994)]) + 
linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=996, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=997, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9657:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=996), constant:[0]]) -> (%9658:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=997)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=997, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=992, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=994, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=994, solved=0), )] (%9658:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=997)], %9653:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=992)], %9656:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=994)]) -> (%9659:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=994)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=994, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=998, solved=0), )] (%9659:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=994)]) -> (%9660:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=998)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=998, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=999, solved=0), )] (%9660:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=998)], %9650:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57)]) -> (%9661:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=999)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=999, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=999, solved=0), )] (%9661:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=999)]) -> (%9662:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=999)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=999, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=999, solved=0), )] (%9662:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=999)]) -> (%9662:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=999)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=999, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1001, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1000, solved=0))] (%9662:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=999)]) -> 
(%9663:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1001)]) + cf.ReturnOp (%9663:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1001)], %9643:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989)], %9645:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=991)]) -> () } } graph.SubGraphOp @model.layers.26.mlp [using_qnn:true, symbol:model.layers.26.mlp] { - (%9176:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=975)]) -> (%9181:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=975), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=978), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=977))] (%9176:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=975)]) -> (%9177:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=978)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=978), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=979), )] (%9177:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=978)]) -> (%9178:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=979)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=975), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=981), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=980))] (%9176:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=975)]) -> (%9179:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=981)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=979), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=981), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=979), )] 
(%9178:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=979)], %9179:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=981)]) -> (%9180:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=979)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=979), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=982))] (%9180:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=979)]) -> (%9181:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)]) - cf.ReturnOp (%9181:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)]) -> () + (%9665:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1002)]) -> (%9671:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1010)]) { + 
linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1002, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1005, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1004, solved=0))] (%9665:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1002)]) -> (%9666:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1005)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1002, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1006, solved=0))] (%9665:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1002)]) -> (%9667:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007)]) + linalg.CPU.SigmoidOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1008, solved=0), )] (%9667:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007)]) -> (%9668:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1008)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1008, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007, solved=0), )] (%9667:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007)], %9668:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1008)]) -> (%9669:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1005, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007, solved=0), )] (%9669:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007)], %9666:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1005)]) -> (%9670:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1010, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1009, solved=0))] (%9670:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007)]) -> (%9671:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1010)]) + cf.ReturnOp 
(%9671:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1010)]) -> () } } graph.SubGraphOp @model.layers.27 [using_qnn:true, symbol:model.layers.27] { - (%9182:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8069:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %8070:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%9223:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017)], %9196:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996)], %9198:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=998)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=984), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=985))] (%9182:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)]) -> (%9183:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=984)]) - graph.CallGraphOp @model.layers.27.self_attn (%9183:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=984)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8069:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %8070:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%9215:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008)], %9196:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996)], 
%9198:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=998)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008), )] (%9215:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008)], %9182:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=983)]) -> (%9216:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1009), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1010))] (%9216:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008)]) -> (%9217:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1009)]) - graph.CallGraphOp @model.layers.27.mlp (%9217:tensor<[1, 
32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1009)]) -> (%9222:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017), )] (%9222:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017)], %9216:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008)]) -> (%9223:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017)]) - cf.ReturnOp (%9223:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017)], %9196:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996)], %9198:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=998)]) -> () + (%9672:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8262:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %8263:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58)]) -> (%9724:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9695:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024)], %9697:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1026)]) { + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1011, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1012, solved=0))] (%9672:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9673:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1011)]) + graph.CallGraphOp @model.layers.27.self_attn (%9673:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1011)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8262:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %8263:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58)]) -> (%9715:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1036)], %9695:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024)], %9697:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1026)]) + linalg.CPU.AddOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1036, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9672:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9715:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1036)]) -> (%9716:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1037, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1038, solved=0))] (%9716:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9717:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=1037)]) + graph.CallGraphOp @model.layers.27.mlp (%9717:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1037)]) -> (%9723:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1045)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1045, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9716:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9723:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1045)]) -> (%9724:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + cf.ReturnOp (%9724:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9695:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024)], 
%9697:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1026)]) -> () } } graph.SubGraphOp @model.layers.27.self_attn [using_qnn:true, symbol:model.layers.27.self_attn] { - (%9183:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=984)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8069:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %8070:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%9215:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008)], %9196:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996)], %9198:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=998)]) { - linalg.CPU.LinearOp (%9183:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=984)]) -> 
(%9184:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=990)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=984), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=987), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=986))] (%9183:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=984)]) -> (%9185:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=987)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=984), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=989), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=988))] (%9183:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=984)]) -> (%9186:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=989)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=990), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=990), )] (%9184:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=990)]) -> (%9184:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=990)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=990), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=990), )] (%9184:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=990)]) -> (%9187:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=990)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=987), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=987), )] (%9185:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=987)]) -> (%9185:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=987)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=987), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=987), )] (%9185:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=987)]) -> (%9188:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=987)]) - linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=989), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=989), )] (%9186:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=989)]) -> (%9186:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=989)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=989), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=989), )] (%9186:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=989)]) -> (%9189:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=989)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=990), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=991), weight_weight:QuantSpec(Raw(type: Int16PerTensor), uuid=992))] (%9187:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=990)]) -> (%9190:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=991)]) - linalg.CPU.RMSNormOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=987), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=993), weight_weight:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=994))] (%9188:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=987)]) -> (%9191:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=993)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=991), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=991), )] (%9190:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=991)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%9192:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=991)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=993), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=993), )] (%9191:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=993)], %8074:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)], %8075:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%9193:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=993)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=993), outputs_0:QuantSpec(Raw(type: Float16), uuid=995), )] (%9193:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=993)]) -> (%9194:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=995)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=995), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=996), )] (%9194:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=995)]) -> (%9195:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996), )] (%9195:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996)]) -> (%9196:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=989), outputs_0:QuantSpec(Raw(type: Float16), uuid=997), )] (%9189:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=989)]) -> (%9197:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=997)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=997), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=998), )] (%9197:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=997)]) -> (%9198:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=998)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30), )] (%8069:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %9196:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996)]) -> (%9199:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=998), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58), )] (%8070:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)], %9198:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=998)]) -> (%9200:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30), )] (%9199:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)]) -> (%9201:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58), )] (%9200:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%9202:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=991), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=999), )] (%9192:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=991)], %9201:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)]) -> (%9203:tensor<[1, 16, 
32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=999)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=999), inputs_1:QuantSpec(Raw(type: Float32), uuid=1000), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=999), )] (%9203:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=999)], %9204:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=1000), constant:[0.088388346]]) -> (%9205:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=999)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=999), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1001), )] (%9205:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=999)]) -> (%9206:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1001)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1001), inputs_1:QuantSpec(Raw(type: Int16), uuid=1002), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=1001), )] (%9206:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1001)], %9207:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=1002), constant:[-20]]) -> (%9208:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1001)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=1003), outputs_0:QuantSpec(Raw(type: UInt8), uuid=1004), )] (%8014:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9209:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=1003), constant:[0.890625]]) -> (%9210:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=1004)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=1004), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=999), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1001), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1001), )] (%9210:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=1004)], %9205:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=999)], %9208:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1001)]) -> (%9211:tensor<[1, 16, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1001)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1001), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1005), )] (%9211:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1001)]) -> (%9212:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1005)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1005), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1006), )] (%9212:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1005)], %9202:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%9213:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1006)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1006), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1006), )] (%9213:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1006)]) -> (%9214:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1006)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1006), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1006), )] (%9214:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1006)]) -> (%9214:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1006)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1006), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1007))] (%9214:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1006)]) -> (%9215:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=1008)]) - cf.ReturnOp (%9215:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1008)], %9196:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=996)], %9198:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=998)]) -> () + (%9673:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1011)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8262:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %8263:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58)]) -> (%9715:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1036)], %9695:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024)], %9697:tensor<[1, 8, 32, 128], UInt8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1026)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1011, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1014, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1013, solved=0))] (%9673:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1011)]) -> (%9674:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1014)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1011, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1016, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1015, solved=0))] (%9673:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1011)]) -> (%9675:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=1016)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1011, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1018, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1017, solved=0))] (%9673:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1011)]) -> (%9676:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1018)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1014, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1014, solved=0), )] (%9674:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1014)]) -> (%9674:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1014)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1014, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1014, solved=0), )] (%9674:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1014)]) -> (%9677:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1014)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1016, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1016, solved=0), )] (%9675:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1016)]) -> (%9675:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1016)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1016, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1016, solved=0), )] (%9675:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1016)]) -> (%9678:tensor<[1, 8, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1016)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1018, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1018, solved=0), )] (%9676:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1018)]) -> (%9676:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1018)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1018, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1018, solved=0), )] (%9676:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1018)]) -> (%9679:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1018)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1014, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1020, solved=0))] (%9677:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1014)]) -> (%9680:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)]) + linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1016, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1022, solved=0))] (%9678:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1016)]) -> (%9681:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), )] (%9680:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)]) -> (%9680:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), )] (%9680:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)]) -> (%9680:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), )] (%9680:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)]) -> (%9682:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), )] (%9682:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)], %9680:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)]) -> (%9683:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), )] (%9683:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9684:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), )] (%9680:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9685:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), )] (%9685:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)], %9684:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)]) -> (%9686:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), )] (%9681:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) -> (%9681:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) + linalg.CPU.SliceOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), )] (%9681:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) -> (%9681:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) + linalg.CPU.NegOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), )] (%9681:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) -> (%9687:tensor<[1, 8, 32, 64], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), )] (%9687:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)], %9681:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) -> (%9688:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), )] (%9688:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)], 
%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9689:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), )] (%9681:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9690:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), )] 
(%9690:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)], %9689:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) -> (%9691:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=1023, solved=0), )] (%9691:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) -> (%9692:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=1023)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=1023, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024, solved=0), )] (%9692:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=1023)]) -> (%9693:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, 
scale_type: Float32), uuid=1024, solved=0), )] (%9693:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024)]) -> (%9695:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1018, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=1025, solved=0), )] (%9679:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1018)]) -> (%9696:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=1025)]) + linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=1025, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1026, solved=0), )] (%9696:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=1025)]) -> (%9697:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1026)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30, solved=0), )] (%8262:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %9695:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024)]) -> (%9699:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)]) + linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1026, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58, solved=0), )] (%8263:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58)], %9697:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1026)]) -> (%9700:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30, solved=0), )] (%9699:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)]) -> (%9701:tensor<[1, 16, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)]) + linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58, solved=0), )] (%9700:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58)]) -> (%9702:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1027, solved=0), )] (%9686:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)], %9701:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)]) -> (%9703:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1027)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1027, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1028, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1027, solved=0), )] (%9703:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1027)], %9704:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1028), constant:[0.088388346]]) -> (%9705:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1027)]) + linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1027, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1029, solved=0), )] (%9705:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1027)]) -> (%9706:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1029)]) + linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=1029, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1030, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1029, solved=0), )] (%9706:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1029)], %9707:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1030), constant:[-20]]) -> (%9708:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1029)]) + linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=1031, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=1032, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9709:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=1031), constant:[0]]) -> (%9710:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=1032)]) + linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=1032, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1027, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1029, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=1029, solved=0), )] (%9710:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=1032)], %9705:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1027)], %9708:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1029)]) -> (%9711:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1029)]) + linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1029, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1033, solved=0), )] (%9711:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1029)]) -> (%9712:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1033)]) + linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1033, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=1034, solved=0), )] (%9712:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1033)], %9702:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58)]) -> (%9713:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1034)]) + linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1034, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1034, solved=0), )] (%9713:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1034)]) -> (%9714:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1034)]) + linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1034, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1034, solved=0), )] (%9714:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=1034)]) -> (%9714:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1034)]) + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1034, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1036, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1035, solved=0))] (%9714:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1034)]) -> (%9715:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1036)]) + cf.ReturnOp (%9715:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1036)], %9695:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024)], %9697:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1026)]) -> () } } graph.SubGraphOp @model.layers.27.mlp [using_qnn:true, symbol:model.layers.27.mlp] { - (%9217:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=1009)]) -> (%9222:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1009), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1012), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1011))] (%9217:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1009)]) -> (%9218:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1012)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1012), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1013), )] (%9218:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1012)]) -> (%9219:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1013)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1009), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=1015), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1014))] (%9217:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1009)]) -> (%9220:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1015)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1013), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1015), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1013), )] (%9219:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1013)], %9220:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1015)]) -> (%9221:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1013)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1013), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), 
uuid=1016))] (%9221:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1013)]) -> (%9222:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017)]) - cf.ReturnOp (%9222:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=1017)]) -> () + (%9717:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1037)]) -> (%9723:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1045)]) { + linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1037, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1040, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1039, solved=0))] (%9717:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1037)]) -> (%9718:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1040)]) + linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1037, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1042, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1041, solved=0))] (%9717:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1037)]) -> (%9719:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1042)]) + linalg.CPU.SigmoidOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1042, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1043, solved=0), )] (%9719:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1042)]) -> (%9720:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1043)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1042, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1043, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1042, solved=0), )] (%9719:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1042)], %9720:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1043)]) -> (%9721:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1042)]) + linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1042, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1040, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1042, solved=0), )] (%9721:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1042)], %9718:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1040)]) -> (%9722:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1042)]) + linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1042, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1045, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1044, solved=0))] (%9722:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1042)]) -> (%9723:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1045)]) + cf.ReturnOp (%9723:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1045)]) -> () } } // ╔═════╗ diff --git a/examples/qwen3_qnn_aot/qwen3_qnn_aot_quant_recipe.mir b/examples/qwen3_qnn_aot/qwen3_qnn_aot_quant_recipe.mir index af9a88521..f498128cc 100644 --- a/examples/qwen3_qnn_aot/qwen3_qnn_aot_quant_recipe.mir +++ b/examples/qwen3_qnn_aot/qwen3_qnn_aot_quant_recipe.mir @@ -1,1902 +1,9 @@ @main () -> () { - graph.SubGraphOp @init [symbol:init] { - () -> () { - tensor.CPU.register () -> (%105:tensor<[151936, 2048], Float32, CPU>[@model.embed_tokens.weight][symbol:model.embed_tokens.weight])[symbol:model.embed_tokens.weight] - tensor.CPU.register () -> (%76:tensor<[2048, 2048], Float32, CPU>[@model.layers.0.self_attn.q_proj.weight][symbol:model.layers.0.self_attn.q_proj.weight])[symbol:model.layers.0.self_attn.q_proj.weight] - tensor.CPU.register () -> (%133:tensor<[1024, 2048], Float32, 
CPU>[@model.layers.0.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=66), symbol:model.layers.0.self_attn.k_proj.weight])[symbol:model.layers.0.self_attn.k_proj.weight] - tensor.CPU.register () -> (%179:tensor<[1024, 2048], Float32, CPU>[@model.layers.0.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=68), symbol:model.layers.0.self_attn.v_proj.weight])[symbol:model.layers.0.self_attn.v_proj.weight] - tensor.CPU.register () -> (%269:tensor<[2048, 2048], Float32, CPU>[@model.layers.0.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=85), symbol:model.layers.0.self_attn.o_proj.weight])[symbol:model.layers.0.self_attn.o_proj.weight] - tensor.CPU.register () -> (%9:tensor<[6144, 2048], Float32, CPU>[@model.layers.0.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=88), symbol:model.layers.0.mlp.gate_proj.weight])[symbol:model.layers.0.mlp.gate_proj.weight] - tensor.CPU.register () -> (%111:tensor<[6144, 2048], Float32, CPU>[@model.layers.0.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=91), symbol:model.layers.0.mlp.up_proj.weight])[symbol:model.layers.0.mlp.up_proj.weight] - tensor.CPU.register () -> (%184:tensor<[2048, 6144], Float32, CPU>[@model.layers.0.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, 
quant_to_type: UInt4, scale_1_type: Float32), uuid=93), symbol:model.layers.0.mlp.down_proj.weight])[symbol:model.layers.0.mlp.down_proj.weight] - tensor.CPU.register () -> (%285:tensor<[2048, 2048], Float32, CPU>[@model.layers.1.self_attn.q_proj.weight][symbol:model.layers.1.self_attn.q_proj.weight])[symbol:model.layers.1.self_attn.q_proj.weight] - tensor.CPU.register () -> (%32:tensor<[1024, 2048], Float32, CPU>[@model.layers.1.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=96), symbol:model.layers.1.self_attn.k_proj.weight])[symbol:model.layers.1.self_attn.k_proj.weight] - tensor.CPU.register () -> (%154:tensor<[1024, 2048], Float32, CPU>[@model.layers.1.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=98), symbol:model.layers.1.self_attn.v_proj.weight])[symbol:model.layers.1.self_attn.v_proj.weight] - tensor.CPU.register () -> (%20:tensor<[2048, 2048], Float32, CPU>[@model.layers.1.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=115), symbol:model.layers.1.self_attn.o_proj.weight])[symbol:model.layers.1.self_attn.o_proj.weight] - tensor.CPU.register () -> (%245:tensor<[6144, 2048], Float32, CPU>[@model.layers.1.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=118), symbol:model.layers.1.mlp.gate_proj.weight])[symbol:model.layers.1.mlp.gate_proj.weight] - tensor.CPU.register () -> (%230:tensor<[6144, 2048], Float32, CPU>[@model.layers.1.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, 
block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=121), symbol:model.layers.1.mlp.up_proj.weight])[symbol:model.layers.1.mlp.up_proj.weight] - tensor.CPU.register () -> (%43:tensor<[2048, 6144], Float32, CPU>[@model.layers.1.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=123), symbol:model.layers.1.mlp.down_proj.weight])[symbol:model.layers.1.mlp.down_proj.weight] - tensor.CPU.register () -> (%221:tensor<[2048, 2048], Float32, CPU>[@model.layers.2.self_attn.q_proj.weight][symbol:model.layers.2.self_attn.q_proj.weight])[symbol:model.layers.2.self_attn.q_proj.weight] - tensor.CPU.register () -> (%103:tensor<[1024, 2048], Float32, CPU>[@model.layers.2.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=126), symbol:model.layers.2.self_attn.k_proj.weight])[symbol:model.layers.2.self_attn.k_proj.weight] - tensor.CPU.register () -> (%47:tensor<[1024, 2048], Float32, CPU>[@model.layers.2.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=128), symbol:model.layers.2.self_attn.v_proj.weight])[symbol:model.layers.2.self_attn.v_proj.weight] - tensor.CPU.register () -> (%85:tensor<[2048, 2048], Float32, CPU>[@model.layers.2.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=145), symbol:model.layers.2.self_attn.o_proj.weight])[symbol:model.layers.2.self_attn.o_proj.weight] - tensor.CPU.register () -> (%252:tensor<[6144, 2048], Float32, 
CPU>[@model.layers.2.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=148), symbol:model.layers.2.mlp.gate_proj.weight])[symbol:model.layers.2.mlp.gate_proj.weight] - tensor.CPU.register () -> (%24:tensor<[6144, 2048], Float32, CPU>[@model.layers.2.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=151), symbol:model.layers.2.mlp.up_proj.weight])[symbol:model.layers.2.mlp.up_proj.weight] - tensor.CPU.register () -> (%28:tensor<[2048, 6144], Float32, CPU>[@model.layers.2.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=153), symbol:model.layers.2.mlp.down_proj.weight])[symbol:model.layers.2.mlp.down_proj.weight] - tensor.CPU.register () -> (%283:tensor<[2048, 2048], Float32, CPU>[@model.layers.3.self_attn.q_proj.weight][symbol:model.layers.3.self_attn.q_proj.weight])[symbol:model.layers.3.self_attn.q_proj.weight] - tensor.CPU.register () -> (%48:tensor<[1024, 2048], Float32, CPU>[@model.layers.3.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=156), symbol:model.layers.3.self_attn.k_proj.weight])[symbol:model.layers.3.self_attn.k_proj.weight] - tensor.CPU.register () -> (%244:tensor<[1024, 2048], Float32, CPU>[@model.layers.3.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=158), symbol:model.layers.3.self_attn.v_proj.weight])[symbol:model.layers.3.self_attn.v_proj.weight] - 
tensor.CPU.register () -> (%301:tensor<[2048, 2048], Float32, CPU>[@model.layers.3.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=175), symbol:model.layers.3.self_attn.o_proj.weight])[symbol:model.layers.3.self_attn.o_proj.weight] - tensor.CPU.register () -> (%129:tensor<[6144, 2048], Float32, CPU>[@model.layers.3.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=178), symbol:model.layers.3.mlp.gate_proj.weight])[symbol:model.layers.3.mlp.gate_proj.weight] - tensor.CPU.register () -> (%188:tensor<[6144, 2048], Float32, CPU>[@model.layers.3.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=181), symbol:model.layers.3.mlp.up_proj.weight])[symbol:model.layers.3.mlp.up_proj.weight] - tensor.CPU.register () -> (%97:tensor<[2048, 6144], Float32, CPU>[@model.layers.3.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=183), symbol:model.layers.3.mlp.down_proj.weight])[symbol:model.layers.3.mlp.down_proj.weight] - tensor.CPU.register () -> (%164:tensor<[2048, 2048], Float32, CPU>[@model.layers.4.self_attn.q_proj.weight][symbol:model.layers.4.self_attn.q_proj.weight])[symbol:model.layers.4.self_attn.q_proj.weight] - tensor.CPU.register () -> (%148:tensor<[1024, 2048], Float32, CPU>[@model.layers.4.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=186), 
symbol:model.layers.4.self_attn.k_proj.weight])[symbol:model.layers.4.self_attn.k_proj.weight] - tensor.CPU.register () -> (%279:tensor<[1024, 2048], Float32, CPU>[@model.layers.4.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=188), symbol:model.layers.4.self_attn.v_proj.weight])[symbol:model.layers.4.self_attn.v_proj.weight] - tensor.CPU.register () -> (%91:tensor<[2048, 2048], Float32, CPU>[@model.layers.4.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=205), symbol:model.layers.4.self_attn.o_proj.weight])[symbol:model.layers.4.self_attn.o_proj.weight] - tensor.CPU.register () -> (%189:tensor<[6144, 2048], Float32, CPU>[@model.layers.4.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=208), symbol:model.layers.4.mlp.gate_proj.weight])[symbol:model.layers.4.mlp.gate_proj.weight] - tensor.CPU.register () -> (%156:tensor<[6144, 2048], Float32, CPU>[@model.layers.4.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=211), symbol:model.layers.4.mlp.up_proj.weight])[symbol:model.layers.4.mlp.up_proj.weight] - tensor.CPU.register () -> (%153:tensor<[2048, 6144], Float32, CPU>[@model.layers.4.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=213), symbol:model.layers.4.mlp.down_proj.weight])[symbol:model.layers.4.mlp.down_proj.weight] - tensor.CPU.register () -> (%78:tensor<[2048, 2048], Float32, 
CPU>[@model.layers.5.self_attn.q_proj.weight][symbol:model.layers.5.self_attn.q_proj.weight])[symbol:model.layers.5.self_attn.q_proj.weight] - tensor.CPU.register () -> (%72:tensor<[1024, 2048], Float32, CPU>[@model.layers.5.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=216), symbol:model.layers.5.self_attn.k_proj.weight])[symbol:model.layers.5.self_attn.k_proj.weight] - tensor.CPU.register () -> (%289:tensor<[1024, 2048], Float32, CPU>[@model.layers.5.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=218), symbol:model.layers.5.self_attn.v_proj.weight])[symbol:model.layers.5.self_attn.v_proj.weight] - tensor.CPU.register () -> (%264:tensor<[2048, 2048], Float32, CPU>[@model.layers.5.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=235), symbol:model.layers.5.self_attn.o_proj.weight])[symbol:model.layers.5.self_attn.o_proj.weight] - tensor.CPU.register () -> (%4:tensor<[6144, 2048], Float32, CPU>[@model.layers.5.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=238), symbol:model.layers.5.mlp.gate_proj.weight])[symbol:model.layers.5.mlp.gate_proj.weight] - tensor.CPU.register () -> (%308:tensor<[6144, 2048], Float32, CPU>[@model.layers.5.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=241), symbol:model.layers.5.mlp.up_proj.weight])[symbol:model.layers.5.mlp.up_proj.weight] - 
tensor.CPU.register () -> (%74:tensor<[2048, 6144], Float32, CPU>[@model.layers.5.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=243), symbol:model.layers.5.mlp.down_proj.weight])[symbol:model.layers.5.mlp.down_proj.weight] - tensor.CPU.register () -> (%59:tensor<[2048, 2048], Float32, CPU>[@model.layers.6.self_attn.q_proj.weight][symbol:model.layers.6.self_attn.q_proj.weight])[symbol:model.layers.6.self_attn.q_proj.weight] - tensor.CPU.register () -> (%208:tensor<[1024, 2048], Float32, CPU>[@model.layers.6.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=246), symbol:model.layers.6.self_attn.k_proj.weight])[symbol:model.layers.6.self_attn.k_proj.weight] - tensor.CPU.register () -> (%238:tensor<[1024, 2048], Float32, CPU>[@model.layers.6.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=248), symbol:model.layers.6.self_attn.v_proj.weight])[symbol:model.layers.6.self_attn.v_proj.weight] - tensor.CPU.register () -> (%52:tensor<[2048, 2048], Float32, CPU>[@model.layers.6.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=265), symbol:model.layers.6.self_attn.o_proj.weight])[symbol:model.layers.6.self_attn.o_proj.weight] - tensor.CPU.register () -> (%80:tensor<[6144, 2048], Float32, CPU>[@model.layers.6.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=268), 
symbol:model.layers.6.mlp.gate_proj.weight])[symbol:model.layers.6.mlp.gate_proj.weight] - tensor.CPU.register () -> (%276:tensor<[6144, 2048], Float32, CPU>[@model.layers.6.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=271), symbol:model.layers.6.mlp.up_proj.weight])[symbol:model.layers.6.mlp.up_proj.weight] - tensor.CPU.register () -> (%227:tensor<[2048, 6144], Float32, CPU>[@model.layers.6.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=273), symbol:model.layers.6.mlp.down_proj.weight])[symbol:model.layers.6.mlp.down_proj.weight] - tensor.CPU.register () -> (%287:tensor<[2048, 2048], Float32, CPU>[@model.layers.7.self_attn.q_proj.weight][symbol:model.layers.7.self_attn.q_proj.weight])[symbol:model.layers.7.self_attn.q_proj.weight] - tensor.CPU.register () -> (%135:tensor<[1024, 2048], Float32, CPU>[@model.layers.7.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=276), symbol:model.layers.7.self_attn.k_proj.weight])[symbol:model.layers.7.self_attn.k_proj.weight] - tensor.CPU.register () -> (%300:tensor<[1024, 2048], Float32, CPU>[@model.layers.7.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=278), symbol:model.layers.7.self_attn.v_proj.weight])[symbol:model.layers.7.self_attn.v_proj.weight] - tensor.CPU.register () -> (%251:tensor<[2048, 2048], Float32, CPU>[@model.layers.7.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, 
quant_to_type: UInt4, scale_1_type: Float32), uuid=295), symbol:model.layers.7.self_attn.o_proj.weight])[symbol:model.layers.7.self_attn.o_proj.weight] - tensor.CPU.register () -> (%155:tensor<[6144, 2048], Float32, CPU>[@model.layers.7.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=298), symbol:model.layers.7.mlp.gate_proj.weight])[symbol:model.layers.7.mlp.gate_proj.weight] - tensor.CPU.register () -> (%218:tensor<[6144, 2048], Float32, CPU>[@model.layers.7.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=301), symbol:model.layers.7.mlp.up_proj.weight])[symbol:model.layers.7.mlp.up_proj.weight] - tensor.CPU.register () -> (%275:tensor<[2048, 6144], Float32, CPU>[@model.layers.7.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=303), symbol:model.layers.7.mlp.down_proj.weight])[symbol:model.layers.7.mlp.down_proj.weight] - tensor.CPU.register () -> (%165:tensor<[2048, 2048], Float32, CPU>[@model.layers.8.self_attn.q_proj.weight][symbol:model.layers.8.self_attn.q_proj.weight])[symbol:model.layers.8.self_attn.q_proj.weight] - tensor.CPU.register () -> (%194:tensor<[1024, 2048], Float32, CPU>[@model.layers.8.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=306), symbol:model.layers.8.self_attn.k_proj.weight])[symbol:model.layers.8.self_attn.k_proj.weight] - tensor.CPU.register () -> (%181:tensor<[1024, 2048], Float32, CPU>[@model.layers.8.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, 
block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=308), symbol:model.layers.8.self_attn.v_proj.weight])[symbol:model.layers.8.self_attn.v_proj.weight] - tensor.CPU.register () -> (%197:tensor<[2048, 2048], Float32, CPU>[@model.layers.8.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=325), symbol:model.layers.8.self_attn.o_proj.weight])[symbol:model.layers.8.self_attn.o_proj.weight] - tensor.CPU.register () -> (%110:tensor<[6144, 2048], Float32, CPU>[@model.layers.8.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=328), symbol:model.layers.8.mlp.gate_proj.weight])[symbol:model.layers.8.mlp.gate_proj.weight] - tensor.CPU.register () -> (%236:tensor<[6144, 2048], Float32, CPU>[@model.layers.8.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=331), symbol:model.layers.8.mlp.up_proj.weight])[symbol:model.layers.8.mlp.up_proj.weight] - tensor.CPU.register () -> (%106:tensor<[2048, 6144], Float32, CPU>[@model.layers.8.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=333), symbol:model.layers.8.mlp.down_proj.weight])[symbol:model.layers.8.mlp.down_proj.weight] - tensor.CPU.register () -> (%235:tensor<[2048, 2048], Float32, CPU>[@model.layers.9.self_attn.q_proj.weight][symbol:model.layers.9.self_attn.q_proj.weight])[symbol:model.layers.9.self_attn.q_proj.weight] - tensor.CPU.register () -> (%69:tensor<[1024, 2048], Float32, 
CPU>[@model.layers.9.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=336), symbol:model.layers.9.self_attn.k_proj.weight])[symbol:model.layers.9.self_attn.k_proj.weight] - tensor.CPU.register () -> (%120:tensor<[1024, 2048], Float32, CPU>[@model.layers.9.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=338), symbol:model.layers.9.self_attn.v_proj.weight])[symbol:model.layers.9.self_attn.v_proj.weight] - tensor.CPU.register () -> (%205:tensor<[2048, 2048], Float32, CPU>[@model.layers.9.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=355), symbol:model.layers.9.self_attn.o_proj.weight])[symbol:model.layers.9.self_attn.o_proj.weight] - tensor.CPU.register () -> (%263:tensor<[6144, 2048], Float32, CPU>[@model.layers.9.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=358), symbol:model.layers.9.mlp.gate_proj.weight])[symbol:model.layers.9.mlp.gate_proj.weight] - tensor.CPU.register () -> (%102:tensor<[6144, 2048], Float32, CPU>[@model.layers.9.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=361), symbol:model.layers.9.mlp.up_proj.weight])[symbol:model.layers.9.mlp.up_proj.weight] - tensor.CPU.register () -> (%136:tensor<[2048, 6144], Float32, CPU>[@model.layers.9.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=363), symbol:model.layers.9.mlp.down_proj.weight])[symbol:model.layers.9.mlp.down_proj.weight] - tensor.CPU.register () -> (%278:tensor<[2048, 2048], Float32, CPU>[@model.layers.10.self_attn.q_proj.weight][symbol:model.layers.10.self_attn.q_proj.weight])[symbol:model.layers.10.self_attn.q_proj.weight] - tensor.CPU.register () -> (%182:tensor<[1024, 2048], Float32, CPU>[@model.layers.10.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=366), symbol:model.layers.10.self_attn.k_proj.weight])[symbol:model.layers.10.self_attn.k_proj.weight] - tensor.CPU.register () -> (%138:tensor<[1024, 2048], Float32, CPU>[@model.layers.10.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=368), symbol:model.layers.10.self_attn.v_proj.weight])[symbol:model.layers.10.self_attn.v_proj.weight] - tensor.CPU.register () -> (%233:tensor<[2048, 2048], Float32, CPU>[@model.layers.10.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=385), symbol:model.layers.10.self_attn.o_proj.weight])[symbol:model.layers.10.self_attn.o_proj.weight] - tensor.CPU.register () -> (%124:tensor<[6144, 2048], Float32, CPU>[@model.layers.10.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=388), symbol:model.layers.10.mlp.gate_proj.weight])[symbol:model.layers.10.mlp.gate_proj.weight] - tensor.CPU.register () -> (%261:tensor<[6144, 2048], Float32, 
CPU>[@model.layers.10.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=391), symbol:model.layers.10.mlp.up_proj.weight])[symbol:model.layers.10.mlp.up_proj.weight] - tensor.CPU.register () -> (%45:tensor<[2048, 6144], Float32, CPU>[@model.layers.10.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=393), symbol:model.layers.10.mlp.down_proj.weight])[symbol:model.layers.10.mlp.down_proj.weight] - tensor.CPU.register () -> (%274:tensor<[2048, 2048], Float32, CPU>[@model.layers.11.self_attn.q_proj.weight][symbol:model.layers.11.self_attn.q_proj.weight])[symbol:model.layers.11.self_attn.q_proj.weight] - tensor.CPU.register () -> (%157:tensor<[1024, 2048], Float32, CPU>[@model.layers.11.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=396), symbol:model.layers.11.self_attn.k_proj.weight])[symbol:model.layers.11.self_attn.k_proj.weight] - tensor.CPU.register () -> (%63:tensor<[1024, 2048], Float32, CPU>[@model.layers.11.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=398), symbol:model.layers.11.self_attn.v_proj.weight])[symbol:model.layers.11.self_attn.v_proj.weight] - tensor.CPU.register () -> (%118:tensor<[2048, 2048], Float32, CPU>[@model.layers.11.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=415), 
symbol:model.layers.11.self_attn.o_proj.weight])[symbol:model.layers.11.self_attn.o_proj.weight] - tensor.CPU.register () -> (%207:tensor<[6144, 2048], Float32, CPU>[@model.layers.11.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=418), symbol:model.layers.11.mlp.gate_proj.weight])[symbol:model.layers.11.mlp.gate_proj.weight] - tensor.CPU.register () -> (%226:tensor<[6144, 2048], Float32, CPU>[@model.layers.11.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=421), symbol:model.layers.11.mlp.up_proj.weight])[symbol:model.layers.11.mlp.up_proj.weight] - tensor.CPU.register () -> (%224:tensor<[2048, 6144], Float32, CPU>[@model.layers.11.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=423), symbol:model.layers.11.mlp.down_proj.weight])[symbol:model.layers.11.mlp.down_proj.weight] - tensor.CPU.register () -> (%217:tensor<[2048, 2048], Float32, CPU>[@model.layers.12.self_attn.q_proj.weight][symbol:model.layers.12.self_attn.q_proj.weight])[symbol:model.layers.12.self_attn.q_proj.weight] - tensor.CPU.register () -> (%297:tensor<[1024, 2048], Float32, CPU>[@model.layers.12.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=426), symbol:model.layers.12.self_attn.k_proj.weight])[symbol:model.layers.12.self_attn.k_proj.weight] - tensor.CPU.register () -> (%94:tensor<[1024, 2048], Float32, CPU>[@model.layers.12.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=428), symbol:model.layers.12.self_attn.v_proj.weight])[symbol:model.layers.12.self_attn.v_proj.weight] - tensor.CPU.register () -> (%49:tensor<[2048, 2048], Float32, CPU>[@model.layers.12.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=445), symbol:model.layers.12.self_attn.o_proj.weight])[symbol:model.layers.12.self_attn.o_proj.weight] - tensor.CPU.register () -> (%262:tensor<[6144, 2048], Float32, CPU>[@model.layers.12.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=448), symbol:model.layers.12.mlp.gate_proj.weight])[symbol:model.layers.12.mlp.gate_proj.weight] - tensor.CPU.register () -> (%255:tensor<[6144, 2048], Float32, CPU>[@model.layers.12.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=451), symbol:model.layers.12.mlp.up_proj.weight])[symbol:model.layers.12.mlp.up_proj.weight] - tensor.CPU.register () -> (%22:tensor<[2048, 6144], Float32, CPU>[@model.layers.12.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=453), symbol:model.layers.12.mlp.down_proj.weight])[symbol:model.layers.12.mlp.down_proj.weight] - tensor.CPU.register () -> (%114:tensor<[2048, 2048], Float32, CPU>[@model.layers.13.self_attn.q_proj.weight][symbol:model.layers.13.self_attn.q_proj.weight])[symbol:model.layers.13.self_attn.q_proj.weight] - tensor.CPU.register () -> (%152:tensor<[1024, 2048], Float32, 
CPU>[@model.layers.13.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=456), symbol:model.layers.13.self_attn.k_proj.weight])[symbol:model.layers.13.self_attn.k_proj.weight] - tensor.CPU.register () -> (%15:tensor<[1024, 2048], Float32, CPU>[@model.layers.13.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=458), symbol:model.layers.13.self_attn.v_proj.weight])[symbol:model.layers.13.self_attn.v_proj.weight] - tensor.CPU.register () -> (%250:tensor<[2048, 2048], Float32, CPU>[@model.layers.13.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=475), symbol:model.layers.13.self_attn.o_proj.weight])[symbol:model.layers.13.self_attn.o_proj.weight] - tensor.CPU.register () -> (%247:tensor<[6144, 2048], Float32, CPU>[@model.layers.13.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=478), symbol:model.layers.13.mlp.gate_proj.weight])[symbol:model.layers.13.mlp.gate_proj.weight] - tensor.CPU.register () -> (%98:tensor<[6144, 2048], Float32, CPU>[@model.layers.13.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=481), symbol:model.layers.13.mlp.up_proj.weight])[symbol:model.layers.13.mlp.up_proj.weight] - tensor.CPU.register () -> (%193:tensor<[2048, 6144], Float32, CPU>[@model.layers.13.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=483), symbol:model.layers.13.mlp.down_proj.weight])[symbol:model.layers.13.mlp.down_proj.weight] - tensor.CPU.register () -> (%209:tensor<[2048, 2048], Float32, CPU>[@model.layers.14.self_attn.q_proj.weight][symbol:model.layers.14.self_attn.q_proj.weight])[symbol:model.layers.14.self_attn.q_proj.weight] - tensor.CPU.register () -> (%38:tensor<[1024, 2048], Float32, CPU>[@model.layers.14.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=486), symbol:model.layers.14.self_attn.k_proj.weight])[symbol:model.layers.14.self_attn.k_proj.weight] - tensor.CPU.register () -> (%232:tensor<[1024, 2048], Float32, CPU>[@model.layers.14.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=488), symbol:model.layers.14.self_attn.v_proj.weight])[symbol:model.layers.14.self_attn.v_proj.weight] - tensor.CPU.register () -> (%168:tensor<[2048, 2048], Float32, CPU>[@model.layers.14.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=505), symbol:model.layers.14.self_attn.o_proj.weight])[symbol:model.layers.14.self_attn.o_proj.weight] - tensor.CPU.register () -> (%37:tensor<[6144, 2048], Float32, CPU>[@model.layers.14.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=508), symbol:model.layers.14.mlp.gate_proj.weight])[symbol:model.layers.14.mlp.gate_proj.weight] - tensor.CPU.register () -> (%147:tensor<[6144, 2048], Float32, 
CPU>[@model.layers.14.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=511), symbol:model.layers.14.mlp.up_proj.weight])[symbol:model.layers.14.mlp.up_proj.weight] - tensor.CPU.register () -> (%163:tensor<[2048, 6144], Float32, CPU>[@model.layers.14.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=513), symbol:model.layers.14.mlp.down_proj.weight])[symbol:model.layers.14.mlp.down_proj.weight] - tensor.CPU.register () -> (%46:tensor<[2048, 2048], Float32, CPU>[@model.layers.15.self_attn.q_proj.weight][symbol:model.layers.15.self_attn.q_proj.weight])[symbol:model.layers.15.self_attn.q_proj.weight] - tensor.CPU.register () -> (%268:tensor<[1024, 2048], Float32, CPU>[@model.layers.15.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=516), symbol:model.layers.15.self_attn.k_proj.weight])[symbol:model.layers.15.self_attn.k_proj.weight] - tensor.CPU.register () -> (%117:tensor<[1024, 2048], Float32, CPU>[@model.layers.15.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=518), symbol:model.layers.15.self_attn.v_proj.weight])[symbol:model.layers.15.self_attn.v_proj.weight] - tensor.CPU.register () -> (%303:tensor<[2048, 2048], Float32, CPU>[@model.layers.15.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=535), 
symbol:model.layers.15.self_attn.o_proj.weight])[symbol:model.layers.15.self_attn.o_proj.weight] - tensor.CPU.register () -> (%260:tensor<[6144, 2048], Float32, CPU>[@model.layers.15.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=538), symbol:model.layers.15.mlp.gate_proj.weight])[symbol:model.layers.15.mlp.gate_proj.weight] - tensor.CPU.register () -> (%42:tensor<[6144, 2048], Float32, CPU>[@model.layers.15.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=541), symbol:model.layers.15.mlp.up_proj.weight])[symbol:model.layers.15.mlp.up_proj.weight] - tensor.CPU.register () -> (%290:tensor<[2048, 6144], Float32, CPU>[@model.layers.15.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=543), symbol:model.layers.15.mlp.down_proj.weight])[symbol:model.layers.15.mlp.down_proj.weight] - tensor.CPU.register () -> (%17:tensor<[2048, 2048], Float32, CPU>[@model.layers.16.self_attn.q_proj.weight][symbol:model.layers.16.self_attn.q_proj.weight])[symbol:model.layers.16.self_attn.q_proj.weight] - tensor.CPU.register () -> (%228:tensor<[1024, 2048], Float32, CPU>[@model.layers.16.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=546), symbol:model.layers.16.self_attn.k_proj.weight])[symbol:model.layers.16.self_attn.k_proj.weight] - tensor.CPU.register () -> (%66:tensor<[1024, 2048], Float32, CPU>[@model.layers.16.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=548), symbol:model.layers.16.self_attn.v_proj.weight])[symbol:model.layers.16.self_attn.v_proj.weight] - tensor.CPU.register () -> (%211:tensor<[2048, 2048], Float32, CPU>[@model.layers.16.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=565), symbol:model.layers.16.self_attn.o_proj.weight])[symbol:model.layers.16.self_attn.o_proj.weight] - tensor.CPU.register () -> (%130:tensor<[6144, 2048], Float32, CPU>[@model.layers.16.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=568), symbol:model.layers.16.mlp.gate_proj.weight])[symbol:model.layers.16.mlp.gate_proj.weight] - tensor.CPU.register () -> (%79:tensor<[6144, 2048], Float32, CPU>[@model.layers.16.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=571), symbol:model.layers.16.mlp.up_proj.weight])[symbol:model.layers.16.mlp.up_proj.weight] - tensor.CPU.register () -> (%248:tensor<[2048, 6144], Float32, CPU>[@model.layers.16.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=573), symbol:model.layers.16.mlp.down_proj.weight])[symbol:model.layers.16.mlp.down_proj.weight] - tensor.CPU.register () -> (%64:tensor<[2048, 2048], Float32, CPU>[@model.layers.17.self_attn.q_proj.weight][symbol:model.layers.17.self_attn.q_proj.weight])[symbol:model.layers.17.self_attn.q_proj.weight] - tensor.CPU.register () -> (%237:tensor<[1024, 2048], Float32, 
CPU>[@model.layers.17.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=576), symbol:model.layers.17.self_attn.k_proj.weight])[symbol:model.layers.17.self_attn.k_proj.weight] - tensor.CPU.register () -> (%6:tensor<[1024, 2048], Float32, CPU>[@model.layers.17.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=578), symbol:model.layers.17.self_attn.v_proj.weight])[symbol:model.layers.17.self_attn.v_proj.weight] - tensor.CPU.register () -> (%125:tensor<[2048, 2048], Float32, CPU>[@model.layers.17.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=595), symbol:model.layers.17.self_attn.o_proj.weight])[symbol:model.layers.17.self_attn.o_proj.weight] - tensor.CPU.register () -> (%177:tensor<[6144, 2048], Float32, CPU>[@model.layers.17.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=598), symbol:model.layers.17.mlp.gate_proj.weight])[symbol:model.layers.17.mlp.gate_proj.weight] - tensor.CPU.register () -> (%26:tensor<[6144, 2048], Float32, CPU>[@model.layers.17.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=601), symbol:model.layers.17.mlp.up_proj.weight])[symbol:model.layers.17.mlp.up_proj.weight] - tensor.CPU.register () -> (%25:tensor<[2048, 6144], Float32, CPU>[@model.layers.17.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=603), symbol:model.layers.17.mlp.down_proj.weight])[symbol:model.layers.17.mlp.down_proj.weight] - tensor.CPU.register () -> (%273:tensor<[2048, 2048], Float32, CPU>[@model.layers.18.self_attn.q_proj.weight][symbol:model.layers.18.self_attn.q_proj.weight])[symbol:model.layers.18.self_attn.q_proj.weight] - tensor.CPU.register () -> (%284:tensor<[1024, 2048], Float32, CPU>[@model.layers.18.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=606), symbol:model.layers.18.self_attn.k_proj.weight])[symbol:model.layers.18.self_attn.k_proj.weight] - tensor.CPU.register () -> (%18:tensor<[1024, 2048], Float32, CPU>[@model.layers.18.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=608), symbol:model.layers.18.self_attn.v_proj.weight])[symbol:model.layers.18.self_attn.v_proj.weight] - tensor.CPU.register () -> (%2:tensor<[2048, 2048], Float32, CPU>[@model.layers.18.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=625), symbol:model.layers.18.self_attn.o_proj.weight])[symbol:model.layers.18.self_attn.o_proj.weight] - tensor.CPU.register () -> (%166:tensor<[6144, 2048], Float32, CPU>[@model.layers.18.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=628), symbol:model.layers.18.mlp.gate_proj.weight])[symbol:model.layers.18.mlp.gate_proj.weight] - tensor.CPU.register () -> (%271:tensor<[6144, 2048], Float32, 
CPU>[@model.layers.18.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=631), symbol:model.layers.18.mlp.up_proj.weight])[symbol:model.layers.18.mlp.up_proj.weight] - tensor.CPU.register () -> (%112:tensor<[2048, 6144], Float32, CPU>[@model.layers.18.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=633), symbol:model.layers.18.mlp.down_proj.weight])[symbol:model.layers.18.mlp.down_proj.weight] - tensor.CPU.register () -> (%8:tensor<[2048, 2048], Float32, CPU>[@model.layers.19.self_attn.q_proj.weight][symbol:model.layers.19.self_attn.q_proj.weight])[symbol:model.layers.19.self_attn.q_proj.weight] - tensor.CPU.register () -> (%286:tensor<[1024, 2048], Float32, CPU>[@model.layers.19.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=636), symbol:model.layers.19.self_attn.k_proj.weight])[symbol:model.layers.19.self_attn.k_proj.weight] - tensor.CPU.register () -> (%50:tensor<[1024, 2048], Float32, CPU>[@model.layers.19.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=638), symbol:model.layers.19.self_attn.v_proj.weight])[symbol:model.layers.19.self_attn.v_proj.weight] - tensor.CPU.register () -> (%58:tensor<[2048, 2048], Float32, CPU>[@model.layers.19.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=655), 
symbol:model.layers.19.self_attn.o_proj.weight])[symbol:model.layers.19.self_attn.o_proj.weight] - tensor.CPU.register () -> (%281:tensor<[6144, 2048], Float32, CPU>[@model.layers.19.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=658), symbol:model.layers.19.mlp.gate_proj.weight])[symbol:model.layers.19.mlp.gate_proj.weight] - tensor.CPU.register () -> (%82:tensor<[6144, 2048], Float32, CPU>[@model.layers.19.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=661), symbol:model.layers.19.mlp.up_proj.weight])[symbol:model.layers.19.mlp.up_proj.weight] - tensor.CPU.register () -> (%173:tensor<[2048, 6144], Float32, CPU>[@model.layers.19.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=663), symbol:model.layers.19.mlp.down_proj.weight])[symbol:model.layers.19.mlp.down_proj.weight] - tensor.CPU.register () -> (%280:tensor<[2048, 2048], Float32, CPU>[@model.layers.20.self_attn.q_proj.weight][symbol:model.layers.20.self_attn.q_proj.weight])[symbol:model.layers.20.self_attn.q_proj.weight] - tensor.CPU.register () -> (%253:tensor<[1024, 2048], Float32, CPU>[@model.layers.20.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=666), symbol:model.layers.20.self_attn.k_proj.weight])[symbol:model.layers.20.self_attn.k_proj.weight] - tensor.CPU.register () -> (%239:tensor<[1024, 2048], Float32, CPU>[@model.layers.20.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=668), symbol:model.layers.20.self_attn.v_proj.weight])[symbol:model.layers.20.self_attn.v_proj.weight] - tensor.CPU.register () -> (%41:tensor<[2048, 2048], Float32, CPU>[@model.layers.20.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=685), symbol:model.layers.20.self_attn.o_proj.weight])[symbol:model.layers.20.self_attn.o_proj.weight] - tensor.CPU.register () -> (%172:tensor<[6144, 2048], Float32, CPU>[@model.layers.20.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=688), symbol:model.layers.20.mlp.gate_proj.weight])[symbol:model.layers.20.mlp.gate_proj.weight] - tensor.CPU.register () -> (%299:tensor<[6144, 2048], Float32, CPU>[@model.layers.20.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=691), symbol:model.layers.20.mlp.up_proj.weight])[symbol:model.layers.20.mlp.up_proj.weight] - tensor.CPU.register () -> (%123:tensor<[2048, 6144], Float32, CPU>[@model.layers.20.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=693), symbol:model.layers.20.mlp.down_proj.weight])[symbol:model.layers.20.mlp.down_proj.weight] - tensor.CPU.register () -> (%295:tensor<[2048, 2048], Float32, CPU>[@model.layers.21.self_attn.q_proj.weight][symbol:model.layers.21.self_attn.q_proj.weight])[symbol:model.layers.21.self_attn.q_proj.weight] - tensor.CPU.register () -> (%139:tensor<[1024, 2048], Float32, 
CPU>[@model.layers.21.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=696), symbol:model.layers.21.self_attn.k_proj.weight])[symbol:model.layers.21.self_attn.k_proj.weight] - tensor.CPU.register () -> (%142:tensor<[1024, 2048], Float32, CPU>[@model.layers.21.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=698), symbol:model.layers.21.self_attn.v_proj.weight])[symbol:model.layers.21.self_attn.v_proj.weight] - tensor.CPU.register () -> (%115:tensor<[2048, 2048], Float32, CPU>[@model.layers.21.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=715), symbol:model.layers.21.self_attn.o_proj.weight])[symbol:model.layers.21.self_attn.o_proj.weight] - tensor.CPU.register () -> (%259:tensor<[6144, 2048], Float32, CPU>[@model.layers.21.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=718), symbol:model.layers.21.mlp.gate_proj.weight])[symbol:model.layers.21.mlp.gate_proj.weight] - tensor.CPU.register () -> (%162:tensor<[6144, 2048], Float32, CPU>[@model.layers.21.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=721), symbol:model.layers.21.mlp.up_proj.weight])[symbol:model.layers.21.mlp.up_proj.weight] - tensor.CPU.register () -> (%183:tensor<[2048, 6144], Float32, CPU>[@model.layers.21.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=723), symbol:model.layers.21.mlp.down_proj.weight])[symbol:model.layers.21.mlp.down_proj.weight] - tensor.CPU.register () -> (%89:tensor<[2048, 2048], Float32, CPU>[@model.layers.22.self_attn.q_proj.weight][symbol:model.layers.22.self_attn.q_proj.weight])[symbol:model.layers.22.self_attn.q_proj.weight] - tensor.CPU.register () -> (%36:tensor<[1024, 2048], Float32, CPU>[@model.layers.22.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=726), symbol:model.layers.22.self_attn.k_proj.weight])[symbol:model.layers.22.self_attn.k_proj.weight] - tensor.CPU.register () -> (%204:tensor<[1024, 2048], Float32, CPU>[@model.layers.22.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=728), symbol:model.layers.22.self_attn.v_proj.weight])[symbol:model.layers.22.self_attn.v_proj.weight] - tensor.CPU.register () -> (%234:tensor<[2048, 2048], Float32, CPU>[@model.layers.22.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=745), symbol:model.layers.22.self_attn.o_proj.weight])[symbol:model.layers.22.self_attn.o_proj.weight] - tensor.CPU.register () -> (%198:tensor<[6144, 2048], Float32, CPU>[@model.layers.22.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=748), symbol:model.layers.22.mlp.gate_proj.weight])[symbol:model.layers.22.mlp.gate_proj.weight] - tensor.CPU.register () -> (%254:tensor<[6144, 2048], Float32, 
CPU>[@model.layers.22.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=751), symbol:model.layers.22.mlp.up_proj.weight])[symbol:model.layers.22.mlp.up_proj.weight] - tensor.CPU.register () -> (%31:tensor<[2048, 6144], Float32, CPU>[@model.layers.22.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=753), symbol:model.layers.22.mlp.down_proj.weight])[symbol:model.layers.22.mlp.down_proj.weight] - tensor.CPU.register () -> (%109:tensor<[2048, 2048], Float32, CPU>[@model.layers.23.self_attn.q_proj.weight][symbol:model.layers.23.self_attn.q_proj.weight])[symbol:model.layers.23.self_attn.q_proj.weight] - tensor.CPU.register () -> (%39:tensor<[1024, 2048], Float32, CPU>[@model.layers.23.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=756), symbol:model.layers.23.self_attn.k_proj.weight])[symbol:model.layers.23.self_attn.k_proj.weight] - tensor.CPU.register () -> (%83:tensor<[1024, 2048], Float32, CPU>[@model.layers.23.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=758), symbol:model.layers.23.self_attn.v_proj.weight])[symbol:model.layers.23.self_attn.v_proj.weight] - tensor.CPU.register () -> (%176:tensor<[2048, 2048], Float32, CPU>[@model.layers.23.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=775), 
symbol:model.layers.23.self_attn.o_proj.weight])[symbol:model.layers.23.self_attn.o_proj.weight] - tensor.CPU.register () -> (%169:tensor<[6144, 2048], Float32, CPU>[@model.layers.23.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=778), symbol:model.layers.23.mlp.gate_proj.weight])[symbol:model.layers.23.mlp.gate_proj.weight] - tensor.CPU.register () -> (%243:tensor<[6144, 2048], Float32, CPU>[@model.layers.23.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=781), symbol:model.layers.23.mlp.up_proj.weight])[symbol:model.layers.23.mlp.up_proj.weight] - tensor.CPU.register () -> (%149:tensor<[2048, 6144], Float32, CPU>[@model.layers.23.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=783), symbol:model.layers.23.mlp.down_proj.weight])[symbol:model.layers.23.mlp.down_proj.weight] - tensor.CPU.register () -> (%11:tensor<[2048, 2048], Float32, CPU>[@model.layers.24.self_attn.q_proj.weight][symbol:model.layers.24.self_attn.q_proj.weight])[symbol:model.layers.24.self_attn.q_proj.weight] - tensor.CPU.register () -> (%61:tensor<[1024, 2048], Float32, CPU>[@model.layers.24.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=786), symbol:model.layers.24.self_attn.k_proj.weight])[symbol:model.layers.24.self_attn.k_proj.weight] - tensor.CPU.register () -> (%81:tensor<[1024, 2048], Float32, CPU>[@model.layers.24.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=788), symbol:model.layers.24.self_attn.v_proj.weight])[symbol:model.layers.24.self_attn.v_proj.weight] - tensor.CPU.register () -> (%127:tensor<[2048, 2048], Float32, CPU>[@model.layers.24.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=805), symbol:model.layers.24.self_attn.o_proj.weight])[symbol:model.layers.24.self_attn.o_proj.weight] - tensor.CPU.register () -> (%141:tensor<[6144, 2048], Float32, CPU>[@model.layers.24.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=808), symbol:model.layers.24.mlp.gate_proj.weight])[symbol:model.layers.24.mlp.gate_proj.weight] - tensor.CPU.register () -> (%126:tensor<[6144, 2048], Float32, CPU>[@model.layers.24.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=811), symbol:model.layers.24.mlp.up_proj.weight])[symbol:model.layers.24.mlp.up_proj.weight] - tensor.CPU.register () -> (%34:tensor<[2048, 6144], Float32, CPU>[@model.layers.24.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=813), symbol:model.layers.24.mlp.down_proj.weight])[symbol:model.layers.24.mlp.down_proj.weight] - tensor.CPU.register () -> (%206:tensor<[2048, 2048], Float32, CPU>[@model.layers.25.self_attn.q_proj.weight][symbol:model.layers.25.self_attn.q_proj.weight])[symbol:model.layers.25.self_attn.q_proj.weight] - tensor.CPU.register () -> (%27:tensor<[1024, 2048], Float32, 
CPU>[@model.layers.25.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=816), symbol:model.layers.25.self_attn.k_proj.weight])[symbol:model.layers.25.self_attn.k_proj.weight] - tensor.CPU.register () -> (%121:tensor<[1024, 2048], Float32, CPU>[@model.layers.25.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=818), symbol:model.layers.25.self_attn.v_proj.weight])[symbol:model.layers.25.self_attn.v_proj.weight] - tensor.CPU.register () -> (%150:tensor<[2048, 2048], Float32, CPU>[@model.layers.25.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=835), symbol:model.layers.25.self_attn.o_proj.weight])[symbol:model.layers.25.self_attn.o_proj.weight] - tensor.CPU.register () -> (%249:tensor<[6144, 2048], Float32, CPU>[@model.layers.25.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=838), symbol:model.layers.25.mlp.gate_proj.weight])[symbol:model.layers.25.mlp.gate_proj.weight] - tensor.CPU.register () -> (%159:tensor<[6144, 2048], Float32, CPU>[@model.layers.25.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=841), symbol:model.layers.25.mlp.up_proj.weight])[symbol:model.layers.25.mlp.up_proj.weight] - tensor.CPU.register () -> (%267:tensor<[2048, 6144], Float32, CPU>[@model.layers.25.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=843), symbol:model.layers.25.mlp.down_proj.weight])[symbol:model.layers.25.mlp.down_proj.weight] - tensor.CPU.register () -> (%265:tensor<[2048, 2048], Float32, CPU>[@model.layers.26.self_attn.q_proj.weight][symbol:model.layers.26.self_attn.q_proj.weight])[symbol:model.layers.26.self_attn.q_proj.weight] - tensor.CPU.register () -> (%190:tensor<[1024, 2048], Float32, CPU>[@model.layers.26.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=846), symbol:model.layers.26.self_attn.k_proj.weight])[symbol:model.layers.26.self_attn.k_proj.weight] - tensor.CPU.register () -> (%119:tensor<[1024, 2048], Float32, CPU>[@model.layers.26.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=848), symbol:model.layers.26.self_attn.v_proj.weight])[symbol:model.layers.26.self_attn.v_proj.weight] - tensor.CPU.register () -> (%88:tensor<[2048, 2048], Float32, CPU>[@model.layers.26.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=865), symbol:model.layers.26.self_attn.o_proj.weight])[symbol:model.layers.26.self_attn.o_proj.weight] - tensor.CPU.register () -> (%96:tensor<[6144, 2048], Float32, CPU>[@model.layers.26.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=868), symbol:model.layers.26.mlp.gate_proj.weight])[symbol:model.layers.26.mlp.gate_proj.weight] - tensor.CPU.register () -> (%62:tensor<[6144, 2048], Float32, 
CPU>[@model.layers.26.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=871), symbol:model.layers.26.mlp.up_proj.weight])[symbol:model.layers.26.mlp.up_proj.weight] - tensor.CPU.register () -> (%220:tensor<[2048, 6144], Float32, CPU>[@model.layers.26.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=873), symbol:model.layers.26.mlp.down_proj.weight])[symbol:model.layers.26.mlp.down_proj.weight] - tensor.CPU.register () -> (%185:tensor<[2048, 2048], Float32, CPU>[@model.layers.27.self_attn.q_proj.weight][symbol:model.layers.27.self_attn.q_proj.weight])[symbol:model.layers.27.self_attn.q_proj.weight] - tensor.CPU.register () -> (%12:tensor<[1024, 2048], Float32, CPU>[@model.layers.27.self_attn.k_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=876), symbol:model.layers.27.self_attn.k_proj.weight])[symbol:model.layers.27.self_attn.k_proj.weight] - tensor.CPU.register () -> (%54:tensor<[1024, 2048], Float32, CPU>[@model.layers.27.self_attn.v_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=878), symbol:model.layers.27.self_attn.v_proj.weight])[symbol:model.layers.27.self_attn.v_proj.weight] - tensor.CPU.register () -> (%60:tensor<[2048, 2048], Float32, CPU>[@model.layers.27.self_attn.o_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=895), 
symbol:model.layers.27.self_attn.o_proj.weight])[symbol:model.layers.27.self_attn.o_proj.weight] - tensor.CPU.register () -> (%144:tensor<[6144, 2048], Float32, CPU>[@model.layers.27.mlp.gate_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=898), symbol:model.layers.27.mlp.gate_proj.weight])[symbol:model.layers.27.mlp.gate_proj.weight] - tensor.CPU.register () -> (%146:tensor<[6144, 2048], Float32, CPU>[@model.layers.27.mlp.up_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=901), symbol:model.layers.27.mlp.up_proj.weight])[symbol:model.layers.27.mlp.up_proj.weight] - tensor.CPU.register () -> (%195:tensor<[2048, 6144], Float32, CPU>[@model.layers.27.mlp.down_proj.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=903), symbol:model.layers.27.mlp.down_proj.weight])[symbol:model.layers.27.mlp.down_proj.weight] - tensor.CPU.register () -> (%101:tensor<[151936, 2048], Float32, CPU>[@lm_head.weight][quant_recipe:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=906), symbol:lm_head.weight])[symbol:lm_head.weight] - } - } - graph.SubGraphOp @deinit [symbol:deinit] { - () -> () { - - } - } - graph.CallGraphOp @model (%318:tensor<[1, 32], Int64, CPU>[quant_recipe:QuantSpec(Raw(type: Int64), uuid=0)], %376:tensor<[1, 32], Int64, CPU>[quant_recipe:QuantSpec(Raw(type: Int64), uuid=1)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %320:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=3)], %322:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %324:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %326:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %328:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %330:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %332:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %334:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %336:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %338:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %340:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %342:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %344:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %346:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %348:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %350:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %352:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %354:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %356:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %358:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %360:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %362:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %364:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %366:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=26)], %368:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %370:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %372:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %374:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %321:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)], %323:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)], %325:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)], %327:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)], %329:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)], %331:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)], %333:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)], %335:tensor<[1, 8, 992, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)], %337:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)], %339:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)], %341:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)], %343:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)], %345:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)], %347:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)], %349:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)], %351:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)], %353:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)], %355:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)], %357:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=49)], %359:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)], %361:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)], %363:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)], %365:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)], %367:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)], %369:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)], %371:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)], %373:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)], %375:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%1530:tensor<[1, 32, 151936], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=907)], %394:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=74)], %435:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=104)], %476:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=134)], %517:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=164)], %558:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=194)], %599:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=224)], %640:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=254)], %681:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)], %722:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=314)], %763:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=344)], %804:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=374)], %845:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=404)], %886:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=434)], %927:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=464)], %968:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=494)], %1009:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=524)], %1050:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %1091:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=584)], %1132:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=614)], %1173:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=644)], %1214:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=674)], %1255:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=704)], %1296:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=734)], %1337:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=764)], %1378:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)], %1419:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=824)], %1460:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=854)], %1501:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=884)], %396:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=76)], %437:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=106)], %478:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=136)], %519:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=166)], %560:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=196)], %601:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=226)], %642:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=256)], %683:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=286)], %724:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)], %765:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=346)], %806:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=376)], %847:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=406)], %888:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=436)], %929:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=466)], %970:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=496)], %1011:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=526)], %1052:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)], %1093:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=586)], %1134:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=616)], %1175:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=646)], %1216:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=676)], %1257:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=706)], %1298:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=736)], %1339:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=766)], %1380:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=796)], %1421:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)], %1462:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=856)], %1503:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=886)]) + graph.CallGraphOp @model (%8206:tensor<[1, 32], Int32, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: Int32), uuid=0)], %8264:tensor<[32], Int32, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: Int32), uuid=1)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8208:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %8210:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %8212:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %8214:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %8216:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %8218:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %8220:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %8222:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %8224:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %8226:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %8228:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %8230:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %8232:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %8234:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %8236:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %8238:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %8240:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %8242:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %8244:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %8246:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %8248:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %8250:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %8252:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %8254:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %8256:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %8258:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %8260:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %8262:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %8209:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31)], %8211:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32)], %8213:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33)], %8215:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34)], %8217:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35)], %8219:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36)], %8221:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37)], %8223:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38)], %8225:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39)], %8227:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40)], %8229:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41)], %8231:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42)], %8233:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43)], %8235:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44)], %8237:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45)], %8239:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46)], %8241:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47)], %8243:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48)], %8245:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49)], %8247:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50)], %8249:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51)], %8251:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52)], %8253:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53)], %8255:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54)], %8257:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55)], %8259:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56)], %8261:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57)], %8263:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58)]) -> (%9726:tensor<[1, 32, 151936], UInt16PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1049)], %8291:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79)], %8343:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114)], %8395:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149)], %8447:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184)], %8499:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219)], %8551:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254)], %8603:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289)], %8655:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324)], %8707:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359)], %8759:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394)], %8811:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429)], %8863:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464)], %8915:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499)], %8967:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534)], %9019:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569)], %9071:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604)], %9123:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639)], %9175:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674)], %9227:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709)], %9279:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744)], %9331:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779)], %9383:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814)], %9435:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849)], %9487:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884)], %9539:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919)], %9591:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954)], %9643:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989)], %9695:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024)], %8293:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=81)], %8345:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=116)], %8397:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=151)], %8449:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=186)], %8501:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=221)], %8553:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=256)], %8605:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=291)], %8657:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=326)], %8709:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=361)], %8761:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=396)], %8813:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=431)], %8865:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=466)], %8917:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=501)], %8969:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=536)], %9021:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=571)], %9073:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=606)], %9125:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=641)], %9177:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=676)], %9229:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=711)], %9281:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=746)], %9333:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=781)], %9385:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=816)], %9437:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=851)], %9489:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=886)], %9541:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=921)], %9593:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=956)], %9645:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=991)], %9697:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1026)]) graph.SubGraphOp @model [using_qnn:true, symbol:model] { - (%318:tensor<[1, 32], Int64, CPU>[quant_recipe:QuantSpec(Raw(type: Int64), uuid=0)], %376:tensor<[1, 32], Int64, CPU>[quant_recipe:QuantSpec(Raw(type: Int64), uuid=1)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %320:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %322:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %324:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %326:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %328:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %330:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %332:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=9)], %334:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %336:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %338:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %340:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %342:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %344:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %346:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %348:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %350:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %352:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %354:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %356:tensor<[1, 8, 128, 992], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %358:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %360:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %362:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %364:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %366:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %368:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %370:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %372:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %374:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %321:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)], %323:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=32)], %325:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)], %327:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)], %329:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)], %331:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)], %333:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)], %335:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)], %337:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)], %339:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)], %341:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)], %343:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)], %345:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)], %347:tensor<[1, 8, 992, 128], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)], %349:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)], %351:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)], %353:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)], %355:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)], %357:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)], %359:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)], %361:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)], %363:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)], %365:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)], %367:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)], %369:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=55)], %371:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)], %373:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)], %375:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%1530:tensor<[1, 32, 151936], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=907)], %394:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=74)], %435:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=104)], %476:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=134)], %517:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=164)], %558:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=194)], %599:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=224)], %640:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=254)], %681:tensor<[1, 8, 128, 32], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)], %722:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=314)], %763:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=344)], %804:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=374)], %845:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=404)], %886:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=434)], %927:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=464)], %968:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=494)], %1009:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=524)], %1050:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %1091:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=584)], %1132:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=614)], %1173:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=644)], %1214:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=674)], %1255:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=704)], %1296:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=734)], %1337:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=764)], %1378:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)], %1419:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=824)], %1460:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=854)], %1501:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=884)], %396:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=76)], %437:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=106)], %478:tensor<[1, 8, 32, 128], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=136)], %519:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=166)], %560:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=196)], %601:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=226)], %642:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=256)], %683:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=286)], %724:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)], %765:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=346)], %806:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=376)], %847:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=406)], %888:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=436)], %929:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=466)], %970:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=496)], %1011:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=526)], %1052:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)], %1093:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=586)], %1134:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=616)], %1175:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=646)], %1216:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=676)], %1257:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=706)], %1298:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=736)], %1339:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=766)], %1380:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=796)], %1421:tensor<[1, 8, 32, 128], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)], %1462:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=856)], %1503:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=886)]) { - linalg.CPU.EmbeddingOp (%318:tensor<[1, 32], Int64, CPU>[quant_recipe:QuantSpec(Raw(type: Int64), uuid=0)]) -> (%377:tensor<[1, 32, 2048], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=59)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float32), uuid=59), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60), )] (%377:tensor<[1, 32, 2048], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=59)]) -> (%378:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int64), uuid=1), outputs_0:QuantSpec(Raw(type: Int64), uuid=1), )] (%376:tensor<[1, 32], Int64, CPU>[quant_recipe:QuantSpec(Raw(type: Int64), uuid=1)]) -> (%376:tensor<[32], Int64, CPU>[quant_recipe:QuantSpec(Raw(type: Int64), uuid=1)]) - linalg.CPU.IndexOp (%316:tensor<[1, 1024, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=61)]) -> (%379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)]) - linalg.CPU.IndexOp (%317:tensor<[1, 1024, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=63)]) -> (%380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) - graph.CallGraphOp @model.layers.0 (%378:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %320:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %321:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) -> (%421:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94)], %394:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=74)], %396:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=76)]) - graph.CallGraphOp @model.layers.1 (%421:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=94)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %322:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %323:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) -> (%462:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)], %435:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=104)], %437:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=106)]) - graph.CallGraphOp @model.layers.2 (%462:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, 
CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %324:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %325:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) -> (%503:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=154)], %476:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=134)], %478:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=136)]) - graph.CallGraphOp @model.layers.3 (%503:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=154)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %326:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %327:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) -> (%544:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184)], %517:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=164)], %519:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=166)]) - graph.CallGraphOp @model.layers.4 (%544:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %328:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %329:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) -> (%585:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=214)], %558:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=194)], %560:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=196)]) - graph.CallGraphOp @model.layers.5 (%585:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=214)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %330:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %331:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) -> (%626:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=244)], %599:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=224)], %601:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=226)]) - graph.CallGraphOp @model.layers.6 (%626:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=244)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %332:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %333:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) -> (%667:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274)], %640:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=254)], %642:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=256)]) - graph.CallGraphOp @model.layers.7 (%667:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %334:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %335:tensor<[1, 8, 992, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) -> (%708:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304)], %681:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)], %683:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=286)]) - graph.CallGraphOp @model.layers.8 (%708:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %336:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %337:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) -> (%749:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=334)], %722:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=314)], %724:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)]) - graph.CallGraphOp @model.layers.9 (%749:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=334)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %338:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %339:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) -> (%790:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=364)], %763:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=344)], %765:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=346)]) - graph.CallGraphOp @model.layers.10 (%790:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=364)], %379:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %340:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %341:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) -> (%831:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394)], %804:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=374)], %806:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=376)]) - graph.CallGraphOp @model.layers.11 (%831:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %342:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %343:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) -> (%872:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=424)], %845:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=404)], %847:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=406)]) - graph.CallGraphOp @model.layers.12 (%872:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=424)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %344:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %345:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) -> (%913:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=454)], %886:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=434)], %888:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=436)]) - graph.CallGraphOp @model.layers.13 (%913:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=454)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %346:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %347:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) -> (%954:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=484)], %927:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=464)], %929:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=466)]) - graph.CallGraphOp @model.layers.14 (%954:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=484)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %348:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %349:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) -> (%995:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=514)], %968:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=494)], %970:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=496)]) - graph.CallGraphOp @model.layers.15 (%995:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=514)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %350:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %351:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) -> (%1036:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544)], %1009:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=524)], %1011:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=526)]) - graph.CallGraphOp @model.layers.16 (%1036:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %352:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %353:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) 
-> (%1077:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574)], %1050:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %1052:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)]) - graph.CallGraphOp @model.layers.17 (%1077:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %354:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %355:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) -> (%1118:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604)], %1091:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=584)], %1093:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=586)]) - graph.CallGraphOp @model.layers.18 (%1118:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %356:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %357:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) -> (%1159:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)], %1132:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=614)], %1134:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=616)]) - graph.CallGraphOp @model.layers.19 (%1159:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], 
%380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %358:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %359:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) -> (%1200:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=664)], %1173:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=644)], %1175:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=646)]) - graph.CallGraphOp @model.layers.20 (%1200:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=664)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %360:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %361:tensor<[1, 8, 
992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) -> (%1241:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694)], %1214:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=674)], %1216:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=676)]) - graph.CallGraphOp @model.layers.21 (%1241:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %362:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %363:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) -> (%1282:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=724)], %1255:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=704)], %1257:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=706)]) - graph.CallGraphOp @model.layers.22 (%1282:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=724)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %364:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %365:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) -> (%1323:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=754)], %1296:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=734)], %1298:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=736)]) - graph.CallGraphOp @model.layers.23 (%1323:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=754)], %379:tensor<[1, 
32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %366:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %367:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) -> (%1364:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784)], %1337:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=764)], %1339:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=766)]) - graph.CallGraphOp @model.layers.24 (%1364:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %368:tensor<[1, 8, 128, 992], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %369:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) -> (%1405:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814)], %1378:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)], %1380:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=796)]) - graph.CallGraphOp @model.layers.25 (%1405:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %370:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %371:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) -> (%1446:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=844)], %1419:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=824)], %1421:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)]) - graph.CallGraphOp @model.layers.26 (%1446:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=844)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %372:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %373:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) -> (%1487:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=874)], %1460:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=854)], %1462:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=856)]) - graph.CallGraphOp @model.layers.27 (%1487:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=874)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %374:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %375:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%1528:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904)], %1501:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=884)], %1503:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=886)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=905), )] (%1528:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904)]) -> (%1529:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=905)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=905), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=907), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=906)), using_qnn:true] (%1529:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=905)]) -> (%1530:tensor<[1, 32, 151936], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=907)]) - cf.ReturnOp (%1530:tensor<[1, 32, 151936], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=907)], %394:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=74)], %435:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=104)], %476:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=134)], %517:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=164)], %558:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=194)], %599:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=224)], %640:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=254)], %681:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)], %722:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=314)], %763:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=344)], %804:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=374)], %845:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=404)], %886:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=434)], %927:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=464)], %968:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=494)], %1009:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=524)], %1050:tensor<[1, 8, 128, 
32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %1091:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=584)], %1132:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=614)], %1173:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=644)], %1214:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=674)], %1255:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=704)], %1296:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=734)], %1337:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=764)], %1378:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)], %1419:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=824)], %1460:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=854)], %1501:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=884)], %396:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=76)], %437:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=106)], %478:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=136)], %519:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=166)], %560:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=196)], %601:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=226)], %642:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=256)], %683:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=286)], %724:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)], %765:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=346)], %806:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=376)], %847:tensor<[1, 8, 32, 
128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=406)], %888:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=436)], %929:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=466)], %970:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=496)], %1011:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=526)], %1052:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)], %1093:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=586)], %1134:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=616)], %1175:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=646)], %1216:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=676)], %1257:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=706)], %1298:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=736)], %1339:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=766)], %1380:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=796)], %1421:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)], %1462:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=856)], %1503:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=886)]) -> () - } - } - graph.SubGraphOp @model.layers.0 [using_qnn:true, symbol:model.layers.0] { - (%378:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %320:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %321:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=31)]) -> (%421:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94)], %394:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=74)], %396:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=76)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), )] (%378:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60)]) -> (%381:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) - graph.CallGraphOp @model.layers.0.self_attn (%381:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %320:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: 
Int8, scale_type: Float32), uuid=3)], %321:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) -> (%413:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=86)], %394:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=74)], %396:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=76)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=86), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=86), )] (%413:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=86)], %378:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=60)]) -> (%414:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=86)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=86), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87), )] 
(%414:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=86)]) -> (%415:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87)]) - graph.CallGraphOp @model.layers.0.mlp (%415:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87)]) -> (%420:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=86), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94), )] (%420:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94)], %414:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=86)]) -> (%421:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94)]) - cf.ReturnOp (%421:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94)], %394:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=74)], %396:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=76)]) -> () - } - } - graph.SubGraphOp @model.layers.0.self_attn [using_qnn:true, symbol:model.layers.0.self_attn] { - (%381:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %320:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %321:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) -> (%413:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=86)], %394:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=74)], %396:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=76)]) { - linalg.CPU.LinearOp (%381:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%382:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=70)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=67), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=66))] (%381:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%383:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=67)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=68))] (%381:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=65)]) -> (%384:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=70), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=70), )] 
(%382:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=70)]) -> (%382:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=70)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=70), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=70), )] (%382:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=70)]) -> (%385:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=70)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=67), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=67), )] (%383:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=67)]) -> (%383:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=67)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=67), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=67), )] (%383:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=67)]) -> (%386:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=67)]) - linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69), )] (%384:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69)]) -> (%384:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69), )] (%384:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69)]) -> (%387:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=70), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71), )] (%385:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=70)]) -> (%388:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=67), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=72), )] (%386:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=67)]) -> (%389:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=72)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71), )] (%388:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%390:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=72), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=72), )] (%389:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=72)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%391:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=72)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=72), outputs_0:QuantSpec(Raw(type: Float16), uuid=73), )] (%391:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=72)]) -> (%392:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=73)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=73), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=74), )] (%392:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=73)]) -> (%393:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=74)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=74), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=74), )] (%393:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=74)]) -> (%394:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=74)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69), outputs_0:QuantSpec(Raw(type: Float16), uuid=75), )] (%387:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=69)]) -> (%395:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=75)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=75), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=76), )] (%395:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=75)]) -> (%396:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=76)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=74), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3), )] (%320:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %394:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=74)]) -> (%397:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=76), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31), )] (%321:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)], %396:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=76)]) -> (%398:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3), )] (%397:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=3)]) -> (%399:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31), )] (%398:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) -> (%400:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=77), )] (%390:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=71)], %399:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)]) -> (%401:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=77)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=77), inputs_1:QuantSpec(Raw(type: Float32), uuid=78), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=77), )] (%401:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=77)], %402:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=78), constant:[0.088388346]]) -> (%403:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=77)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=77), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=79), )] (%403:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=77)]) -> (%404:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=79)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=79), inputs_1:QuantSpec(Raw(type: Int16), uuid=80), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=79), )] (%404:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=79)], %405:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=80), constant:[-20]]) -> (%406:tensor<[1, 16, 32, 1], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=79)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=81), outputs_0:QuantSpec(Raw(type: UInt8), uuid=82), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %407:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=81), constant:[0]]) -> (%408:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=82)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=82), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=77), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=79), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=79), )] (%408:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=82)], %403:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=77)], %406:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=79)]) -> (%409:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=79)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=79), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=83), )] (%409:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=79)]) -> (%410:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=84), )] (%410:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=83)], %400:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=31)]) -> (%411:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=84)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=84), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=84), )] (%411:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=84)]) -> (%412:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=84)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=84), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=84), )] (%412:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=84)]) -> (%412:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=84)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=84), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=86), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=85))] (%412:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=84)]) -> (%413:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=86)]) - cf.ReturnOp (%413:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=86)], %394:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=74)], %396:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 
127, quant_to_type: Int8, scale_type: Float32), uuid=76)]) -> () - } - } - graph.SubGraphOp @model.layers.0.mlp [using_qnn:true, symbol:model.layers.0.mlp] { - (%415:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87)]) -> (%420:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=89), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=88))] (%415:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87)]) -> (%416:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=89)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=89), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90), )] (%416:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=89)]) -> (%417:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=90)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=92), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=91))] (%415:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=87)]) -> (%418:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=92)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=92), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90), )] (%417:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90)], %418:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=92)]) -> (%419:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=93))] (%419:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=90)]) -> (%420:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94)]) - cf.ReturnOp (%420:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94)]) -> () - } - } - graph.SubGraphOp @model.layers.1 [using_qnn:true, symbol:model.layers.1] { - (%421:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %322:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %323:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) -> (%462:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)], %435:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=104)], %437:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=106)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=95), )] (%421:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94)]) -> (%422:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=95)]) - graph.CallGraphOp @model.layers.1.self_attn (%422:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=95)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %322:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %323:tensor<[1, 
8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) -> (%454:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=116)], %435:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=104)], %437:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=106)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=116), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=116), )] (%454:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=116)], %421:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=94)]) -> (%455:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=116)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=116), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117), )] (%455:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=116)]) -> (%456:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117)]) - graph.CallGraphOp @model.layers.1.mlp (%456:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117)]) -> (%461:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=116), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124), )] (%461:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)], %455:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=116)]) -> (%462:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)]) - cf.ReturnOp (%462:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)], %435:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: 
Int8, scale_type: Float32), uuid=104)], %437:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=106)]) -> () - } - } - graph.SubGraphOp @model.layers.1.self_attn [using_qnn:true, symbol:model.layers.1.self_attn] { - (%422:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=95)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %322:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %323:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) -> (%454:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=116)], %435:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=104)], %437:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=106)]) { - linalg.CPU.LinearOp (%422:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=95)]) -> (%423:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=100)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=95), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=97), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=96))] (%422:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=95)]) -> (%424:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=97)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=95), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=98))] (%422:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=95)]) -> (%425:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=100), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=100), )] (%423:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=100)]) -> (%423:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=100)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=100), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=100), )] (%423:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=100)]) -> (%426:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=100)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=97), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=97), )] (%424:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=97)]) -> (%424:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=97)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=97), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=97), )] (%424:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=97)]) -> (%427:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=97)]) - linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99), )] (%425:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)]) -> (%425:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99), )] (%425:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)]) -> (%428:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=100), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=101), )] (%426:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=100)]) -> (%429:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=101)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=97), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=102), )] (%427:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=97)]) -> (%430:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=102)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=101), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=101), )] (%429:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=101)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%431:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=101)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=102), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=102), )] (%430:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=102)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%432:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=102)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=102), outputs_0:QuantSpec(Raw(type: Float16), uuid=103), )] (%432:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=102)]) -> (%433:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=103)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=103), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=104), )] (%433:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=103)]) -> (%434:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=104)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=104), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=104), )] (%434:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=104)]) -> (%435:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=104)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99), outputs_0:QuantSpec(Raw(type: Float16), uuid=105), )] (%428:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=99)]) -> (%436:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=105)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=105), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=106), )] (%436:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=105)]) -> (%437:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=106)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=104), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4), )] (%322:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %435:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=104)]) -> (%438:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=106), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32), )] (%323:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)], %437:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=106)]) -> (%439:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4), )] (%438:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 
127, quant_to_type: Int8, scale_type: Float32), uuid=4)]) -> (%440:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32), )] (%439:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) -> (%441:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=101), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=107), )] (%431:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=101)], %440:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)]) -> (%442:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=107)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=107), inputs_1:QuantSpec(Raw(type: Float32), uuid=108), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=107), )] (%442:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=107)], %443:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=108), constant:[0.088388346]]) -> (%444:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=107)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=107), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109), )] (%444:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=107)]) -> (%445:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109), inputs_1:QuantSpec(Raw(type: Int16), uuid=110), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109), )] (%445:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109)], %446:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=110), constant:[-20]]) -> (%447:tensor<[1, 16, 
32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=111), outputs_0:QuantSpec(Raw(type: UInt8), uuid=112), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %448:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=111), constant:[0]]) -> (%449:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=112)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=112), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=107), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109), )] (%449:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=112)], %444:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=107)], %447:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109)]) -> (%450:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=113), )] (%450:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=109)]) -> (%451:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=113)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=113), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=114), )] (%451:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=113)], %441:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=32)]) -> (%452:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=114)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=114), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=114), )] (%452:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=114)]) -> (%453:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=114)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=114), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=114), )] (%453:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=114)]) -> (%453:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=114)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=114), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=116), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=115))] (%453:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=114)]) -> (%454:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=116)]) - cf.ReturnOp (%454:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=116)], %435:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=104)], %437:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=106)]) -> () - } - } - graph.SubGraphOp @model.layers.1.mlp [using_qnn:true, symbol:model.layers.1.mlp] { - (%456:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117)]) -> (%461:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=119), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=118))] (%456:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117)]) -> (%457:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=119)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=119), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=120), )] (%457:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=119)]) -> (%458:tensor<[1, 32, 6144], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=120)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=121))] (%456:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=117)]) -> (%459:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=120), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=120), )] (%458:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=120)], %459:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=122)]) -> (%460:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=120)]) - linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=120), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=123))] (%460:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=120)]) -> (%461:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)]) - cf.ReturnOp (%461:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)]) -> () - } - } - graph.SubGraphOp @model.layers.2 [using_qnn:true, symbol:model.layers.2] { - (%462:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %324:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %325:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) -> (%503:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=154)], %476:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=134)], %478:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=136)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=125), )] (%462:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)]) -> (%463:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=125)]) - graph.CallGraphOp @model.layers.2.self_attn (%463:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=125)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %324:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %325:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) -> (%495:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=146)], %476:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=134)], %478:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=136)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=146), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=146), )] (%495:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=146)], %462:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=124)]) -> (%496:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=146)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=146), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=147), )] (%496:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=146)]) -> (%497:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=147)]) - graph.CallGraphOp @model.layers.2.mlp (%497:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=147)]) -> (%502:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=154)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=154), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=146), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=154), )] (%502:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=154)], %496:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=146)]) -> (%503:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=154)]) - cf.ReturnOp (%503:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=154)], %476:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=134)], %478:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=136)]) -> () - } - } - graph.SubGraphOp @model.layers.2.self_attn [using_qnn:true, symbol:model.layers.2.self_attn] { - (%463:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=125)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %324:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %325:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) -> (%495:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=146)], %476:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=134)], %478:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=136)]) { - 
linalg.CPU.LinearOp (%463:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=125)]) -> (%464:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=130)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=125), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=127), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=126))] (%463:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=125)]) -> (%465:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=127)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=125), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=128))] (%463:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=125)]) -> (%466:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129)]) - linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=130), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=130), )] (%464:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=130)]) -> (%464:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=130)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=130), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=130), )] (%464:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=130)]) -> (%467:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=130)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=127), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=127), )] (%465:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=127)]) -> (%465:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=127)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=127), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=127), )] (%465:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=127)]) -> (%468:tensor<[1, 8, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=127)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129), )] (%466:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129)]) -> (%466:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129), )] (%466:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129)]) -> (%469:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=130), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=131), )] (%467:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=130)]) -> (%470:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=131)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=127), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=132), )] (%468:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=127)]) -> (%471:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=132)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=131), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=131), )] (%470:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=131)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%472:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=131)]) - linalg.CPU.RoPEOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=132), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=132), )] (%471:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=132)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%473:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=132)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=132), outputs_0:QuantSpec(Raw(type: Float16), uuid=133), )] (%473:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=132)]) -> (%474:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=133)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=133), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=134), )] (%474:tensor<[1, 
8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=133)]) -> (%475:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=134)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=134), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=134), )] (%475:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=134)]) -> (%476:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=134)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129), outputs_0:QuantSpec(Raw(type: Float16), uuid=135), )] (%469:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=129)]) -> (%477:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=135)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=135), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=136), )] (%477:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=135)]) -> (%478:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=136)]) - linalg.CPU.ConcatOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=134), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5), )] (%324:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %476:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=134)]) -> (%479:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=136), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33), )] (%325:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)], %478:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=136)]) -> (%480:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=5), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5), )] (%479:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)]) -> (%481:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33), )] (%480:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) -> (%482:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=131), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137), )] (%472:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=131)], %481:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)]) -> (%483:tensor<[1, 16, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137), inputs_1:QuantSpec(Raw(type: Float32), uuid=138), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137), )] (%483:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137)], %484:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=138), constant:[0.088388346]]) -> (%485:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139), )] (%485:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137)]) -> (%486:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139), inputs_1:QuantSpec(Raw(type: Int16), uuid=140), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139), )] (%486:tensor<[1, 
16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139)], %487:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=140), constant:[-20]]) -> (%488:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=141), outputs_0:QuantSpec(Raw(type: UInt8), uuid=142), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %489:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=141), constant:[0]]) -> (%490:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=142)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=142), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139), )] (%490:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=142)], %485:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=137)], %488:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139)]) -> (%491:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=139)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=143), )] (%491:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=139)]) -> (%492:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=143)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=143), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=144), )] (%492:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=143)], %482:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=33)]) -> (%493:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=144)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=144), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=144), )] 
(%493:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=144)]) -> (%494:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=144)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=144), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=144), )] (%494:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=144)]) -> (%494:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=144)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=144), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=146), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=145))] (%494:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=144)]) -> (%495:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=146)]) - cf.ReturnOp (%495:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=146)], %476:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=134)], %478:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=136)]) -> () - } - } - graph.SubGraphOp @model.layers.2.mlp [using_qnn:true, symbol:model.layers.2.mlp] { - (%497:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=147)]) -> (%502:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=154)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=147), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=149), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=148))] (%497:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=147)]) -> (%498:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=149)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=149), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=150), )] 
(%498:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=149)]) -> (%499:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=150)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=147), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=152), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=151))] (%497:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=147)]) -> (%500:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=152)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=150), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=152), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=150), )] (%499:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=150)], %500:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=152)]) -> (%501:tensor<[1, 32, 
6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=150)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=150), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=154), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=153))] (%501:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=150)]) -> (%502:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=154)]) - cf.ReturnOp (%502:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=154)]) -> () - } - } - graph.SubGraphOp @model.layers.3 [using_qnn:true, symbol:model.layers.3] { - (%503:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=154)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %326:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %327:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) -> (%544:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184)], %517:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=164)], %519:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=166)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=154), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155), )] (%503:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=154)]) -> (%504:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155)]) - graph.CallGraphOp @model.layers.3.self_attn (%504:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %326:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %327:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) -> (%536:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=176)], %517:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=164)], %519:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=166)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=176), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=154), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=176), )] (%536:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=176)], %503:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=154)]) -> (%537:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=176)]) - linalg.CPU.RMSNormOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=176), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=177), )] (%537:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=176)]) -> (%538:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=177)]) - graph.CallGraphOp @model.layers.3.mlp (%538:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=177)]) -> (%543:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=176), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184), )] (%543:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184)], %537:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=176)]) -> (%544:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=184)]) - cf.ReturnOp (%544:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184)], %517:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=164)], %519:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=166)]) -> () - } - } - graph.SubGraphOp @model.layers.3.self_attn [using_qnn:true, symbol:model.layers.3.self_attn] { - (%504:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %326:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %327:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) -> (%536:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=176)], %517:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=164)], 
%519:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=166)]) { - linalg.CPU.LinearOp (%504:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155)]) -> (%505:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=160)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=157), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=156))] (%504:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155)]) -> (%506:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=157)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=158))] (%504:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=155)]) -> (%507:tensor<[1, 32, 
1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=160), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=160), )] (%505:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=160)]) -> (%505:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=160)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=160), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=160), )] (%505:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=160)]) -> (%508:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=160)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=157), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=157), )] (%506:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=157)]) -> (%506:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=157)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=157), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=157), )] (%506:tensor<[1, 32, 8, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=157)]) -> (%509:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=157)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159), )] (%507:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159)]) -> (%507:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159), )] (%507:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159)]) -> (%510:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=160), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=161), )] (%508:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), 
uuid=160)]) -> (%511:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=161)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=157), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=162), )] (%509:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=157)]) -> (%512:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=162)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=161), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=161), )] (%511:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=161)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%513:tensor<[1, 16, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=161)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=162), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=162), )] (%512:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=162)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%514:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=162)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=162), outputs_0:QuantSpec(Raw(type: Float16), uuid=163), )] (%514:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=162)]) -> (%515:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=163)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: 
Float16), uuid=163), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=164), )] (%515:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=163)]) -> (%516:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=164)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=164), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=164), )] (%516:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=164)]) -> (%517:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=164)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159), outputs_0:QuantSpec(Raw(type: Float16), uuid=165), )] (%510:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=159)]) -> (%518:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=165)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=165), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=166), )] (%518:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=165)]) -> (%519:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=166)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=164), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6), )] (%326:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %517:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=164)]) -> (%520:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=166), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34), )] (%327:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)], %519:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=166)]) -> (%521:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) - 
linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6), )] (%520:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)]) -> (%522:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34), )] (%521:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) -> (%523:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=161), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167), )] (%513:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=161)], %522:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)]) -> (%524:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167), inputs_1:QuantSpec(Raw(type: Float32), uuid=168), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167), )] (%524:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)], %525:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=168), constant:[0.088388346]]) -> (%526:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=169), )] (%526:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)]) -> (%527:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=169)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=169), inputs_1:QuantSpec(Raw(type: Int16), uuid=170), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=169), )] (%527:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=169)], %528:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=170), constant:[-20]]) -> (%529:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=169)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=171), outputs_0:QuantSpec(Raw(type: UInt8), uuid=172), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %530:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=171), constant:[0]]) -> (%531:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=172)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=172), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=169), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=169), )] (%531:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=172)], %526:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=167)], %529:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=169)]) -> (%532:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=169)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=169), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173), )] (%532:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=169)]) -> (%533:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=174), )] (%533:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=173)], %523:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=34)]) -> (%534:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=174)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=174), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=174), )] (%534:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=174)]) -> (%535:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=174)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=174), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=174), )] (%535:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=174)]) -> (%535:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=174)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=174), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=176), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=175))] (%535:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=174)]) -> (%536:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=176)]) 
- cf.ReturnOp (%536:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=176)], %517:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=164)], %519:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=166)]) -> () - } - } - graph.SubGraphOp @model.layers.3.mlp [using_qnn:true, symbol:model.layers.3.mlp] { - (%538:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=177)]) -> (%543:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=177), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=179), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=178))] (%538:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=177)]) -> (%539:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=179)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=179), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=180), )] (%539:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=179)]) -> (%540:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=180)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=177), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=182), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=181))] (%538:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=177)]) -> (%541:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=182)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=180), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=182), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=180), )] (%540:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=180)], %541:tensor<[1, 32, 6144], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=182)]) -> (%542:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=180)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=180), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=183))] (%542:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=180)]) -> (%543:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184)]) - cf.ReturnOp (%543:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184)]) -> () - } - } - graph.SubGraphOp @model.layers.4 [using_qnn:true, symbol:model.layers.4] { - (%544:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, 
CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %328:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %329:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) -> (%585:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=214)], %558:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=194)], %560:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=196)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185), )] (%544:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184)]) -> (%545:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185)]) - graph.CallGraphOp @model.layers.4.self_attn (%545:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], 
%380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %328:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %329:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) -> (%577:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=206)], %558:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=194)], %560:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=196)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=206), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=206), )] (%577:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=206)], %544:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=184)]) -> (%578:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=206)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=206), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207), )] (%578:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=206)]) -> (%579:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207)]) - graph.CallGraphOp @model.layers.4.mlp (%579:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207)]) -> (%584:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=214)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=214), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=206), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=214), )] (%584:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=214)], %578:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=206)]) -> (%585:tensor<[1, 
32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=214)]) - cf.ReturnOp (%585:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=214)], %558:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=194)], %560:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=196)]) -> () - } - } - graph.SubGraphOp @model.layers.4.self_attn [using_qnn:true, symbol:model.layers.4.self_attn] { - (%545:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %328:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %329:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) -> (%577:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=206)], %558:tensor<[1, 8, 128, 32], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=194)], %560:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=196)]) { - linalg.CPU.LinearOp (%545:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185)]) -> (%546:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=190)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=187), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=186))] (%545:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185)]) -> (%547:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=187)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=188))] (%545:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=185)]) -> (%548:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=190), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=190), )] (%546:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=190)]) -> (%546:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=190)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=190), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=190), )] (%546:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=190)]) -> (%549:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=190)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=187), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=187), )] (%547:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=187)]) -> (%547:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=187)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=187), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=187), )] (%547:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=187)]) -> (%550:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=187)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189), )] (%548:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189)]) -> (%548:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189), )] (%548:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189)]) -> (%551:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=190), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=191), )] (%549:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=190)]) -> (%552:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=191)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=187), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192), )] (%550:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=187)]) -> (%553:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=191), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=191), )] (%552:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=191)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%554:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=191)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192), )] (%553:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%555:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192), outputs_0:QuantSpec(Raw(type: Float16), uuid=193), )] (%555:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=192)]) -> 
(%556:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=193)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=193), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=194), )] (%556:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=193)]) -> (%557:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=194)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=194), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=194), )] (%557:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=194)]) -> (%558:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=194)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189), outputs_0:QuantSpec(Raw(type: Float16), uuid=195), )] (%551:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=189)]) -> (%559:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=195)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=195), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=196), )] 
(%559:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=195)]) -> (%560:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=196)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=194), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7), )] (%328:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %558:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=194)]) -> (%561:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=196), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35), )] (%329:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)], %560:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=196)]) -> (%562:tensor<[1, 8, 1024, 
128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7), )] (%561:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)]) -> (%563:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35), )] (%562:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) -> (%564:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=191), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=197), )] (%554:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=191)], %563:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)]) -> (%565:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=197)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=197), inputs_1:QuantSpec(Raw(type: Float32), uuid=198), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=197), )] (%565:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=197)], %566:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=198), constant:[0.088388346]]) -> (%567:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=197)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=197), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=199), )] (%567:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=197)]) -> (%568:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=199)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=199), inputs_1:QuantSpec(Raw(type: Int16), uuid=200), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=199), )] (%568:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=199)], %569:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=200), constant:[-20]]) -> (%570:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=199)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=201), outputs_0:QuantSpec(Raw(type: UInt8), uuid=202), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %571:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=201), constant:[0]]) -> (%572:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=202)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=202), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=197), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=199), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=199), )] (%572:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=202)], %567:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=197)], %570:tensor<[1, 16, 32, 1], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=199)]) -> (%573:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=199)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=199), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=203), )] (%573:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=199)]) -> (%574:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=203)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=203), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=204), )] (%574:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=203)], %564:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=35)]) -> (%575:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=204)]) - linalg.CPU.TransposeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=204), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=204), )] (%575:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=204)]) -> (%576:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=204)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=204), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=204), )] (%576:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=204)]) -> (%576:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=204)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=204), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=206), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=205))] (%576:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=204)]) -> (%577:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=206)]) - cf.ReturnOp (%577:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=206)], %558:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=194)], %560:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=196)]) -> () - } - } - graph.SubGraphOp @model.layers.4.mlp [using_qnn:true, symbol:model.layers.4.mlp] { - (%579:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207)]) -> (%584:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=214)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=209), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=208))] (%579:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207)]) -> (%580:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=209)]) - 
linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=209), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=210), )] (%580:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=209)]) -> (%581:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=210)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=212), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=211))] (%579:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=207)]) -> (%582:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=212)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=210), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=212), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=210), )] (%581:tensor<[1, 32, 6144], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=210)], %582:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=212)]) -> (%583:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=210)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=210), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=214), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=213))] (%583:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=210)]) -> (%584:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=214)]) - cf.ReturnOp (%584:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=214)]) -> () - } - } - graph.SubGraphOp @model.layers.5 [using_qnn:true, symbol:model.layers.5] { - (%585:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=214)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %330:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %331:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) -> (%626:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=244)], %599:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=224)], %601:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=226)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=214), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=215), )] (%585:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=214)]) -> (%586:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=215)]) - graph.CallGraphOp @model.layers.5.self_attn (%586:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=215)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %330:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %331:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) -> (%618:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236)], %599:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=224)], %601:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=226)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=214), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236), )] (%618:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236)], %585:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=214)]) -> (%619:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=237), )] (%619:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236)]) -> (%620:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=237)]) - graph.CallGraphOp @model.layers.5.mlp (%620:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=237)]) -> (%625:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=244)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=244), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=244), )] (%625:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=244)], %619:tensor<[1, 
32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236)]) -> (%626:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=244)]) - cf.ReturnOp (%626:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=244)], %599:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=224)], %601:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=226)]) -> () - } - } - graph.SubGraphOp @model.layers.5.self_attn [using_qnn:true, symbol:model.layers.5.self_attn] { - (%586:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=215)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %330:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %331:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) -> (%618:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236)], %599:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=224)], %601:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=226)]) { - linalg.CPU.LinearOp (%586:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=215)]) -> (%587:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=220)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=215), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=216))] (%586:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=215)]) -> (%588:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=215), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 
7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=218))] (%586:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=215)]) -> (%589:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=220), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=220), )] (%587:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=220)]) -> (%587:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=220)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=220), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=220), )] (%587:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=220)]) -> (%590:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=220)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217), )] (%588:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217)]) -> (%588:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217)]) - linalg.CPU.TransposeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217), )] (%588:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217)]) -> (%591:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219), )] (%589:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219)]) -> (%589:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219), )] (%589:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219)]) -> (%592:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219)]) - 
linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=220), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=221), )] (%590:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=220)]) -> (%593:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=221)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=222), )] (%591:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=217)]) -> (%594:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=222)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=221), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=221), )] (%593:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=221)], %379:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%595:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=221)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=222), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=222), )] (%594:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=222)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%596:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=222)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=222), outputs_0:QuantSpec(Raw(type: Float16), uuid=223), )] 
(%596:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=222)]) -> (%597:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=223)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=223), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=224), )] (%597:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=223)]) -> (%598:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=224)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=224), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=224), )] (%598:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=224)]) -> (%599:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=224)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219), outputs_0:QuantSpec(Raw(type: Float16), uuid=225), )] (%592:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=219)]) -> (%600:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=225)]) - linalg.CPU.CastTypeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=225), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=226), )] (%600:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=225)]) -> (%601:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=226)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=224), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8), )] (%330:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %599:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=224)]) -> (%602:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=226), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36), )] (%331:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)], 
%601:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=226)]) -> (%603:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8), )] (%602:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)]) -> (%604:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36), )] (%603:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) -> (%605:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=221), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=227), )] (%595:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=221)], %604:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)]) -> (%606:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=227)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=227), inputs_1:QuantSpec(Raw(type: Float32), uuid=228), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=227), )] (%606:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=227)], %607:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=228), constant:[0.088388346]]) -> (%608:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=227)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=227), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=229), )] (%608:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=227)]) -> (%609:tensor<[1, 16, 32, 1], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=229)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=229), inputs_1:QuantSpec(Raw(type: Int16), uuid=230), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=229), )] (%609:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=229)], %610:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=230), constant:[-20]]) -> (%611:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=229)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=231), outputs_0:QuantSpec(Raw(type: UInt8), uuid=232), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %612:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=231), constant:[0]]) -> (%613:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=232)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=232), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=227), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=229), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=229), )] (%613:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), 
uuid=232)], %608:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=227)], %611:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=229)]) -> (%614:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=229)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=229), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=233), )] (%614:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=229)]) -> (%615:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=233)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=233), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=36), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234), )] (%615:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=233)], %605:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=36)]) -> (%616:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234), )] (%616:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234)]) -> (%617:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234), )] (%617:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234)]) -> (%617:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), 
uuid=235))] (%617:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=234)]) -> (%618:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236)]) - cf.ReturnOp (%618:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=236)], %599:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=224)], %601:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=226)]) -> () - } - } - graph.SubGraphOp @model.layers.5.mlp [using_qnn:true, symbol:model.layers.5.mlp] { - (%620:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=237)]) -> (%625:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=244)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=237), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=238))] (%620:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=237)]) -> (%621:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=240), )] (%621:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=239)]) -> (%622:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=240)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=237), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=242), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=241))] (%620:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=237)]) -> (%623:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=242)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=240), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=242), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=240), )] (%622:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=240)], %623:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=242)]) -> (%624:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=240)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=240), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=244), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=243))] (%624:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=240)]) -> (%625:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=244)]) - cf.ReturnOp (%625:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=244)]) -> () - } - } - graph.SubGraphOp @model.layers.6 [using_qnn:true, symbol:model.layers.6] { - (%626:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=244)], %379:tensor<[1, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %332:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %333:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) -> (%667:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274)], %640:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=254)], %642:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=256)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=244), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=245), )] (%626:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=244)]) -> (%627:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=245)]) - graph.CallGraphOp 
@model.layers.6.self_attn (%627:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=245)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %332:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %333:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) -> (%659:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=266)], %640:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=254)], %642:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=256)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=266), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=244), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=266), )] (%659:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=266)], %626:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=244)]) -> (%660:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=266)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=266), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=267), )] (%660:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=266)]) -> (%661:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=267)]) - graph.CallGraphOp @model.layers.6.mlp (%661:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=267)]) -> (%666:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=266), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274), )] (%666:tensor<[1, 
32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274)], %660:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=266)]) -> (%667:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274)]) - cf.ReturnOp (%667:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274)], %640:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=254)], %642:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=256)]) -> () - } - } - graph.SubGraphOp @model.layers.6.self_attn [using_qnn:true, symbol:model.layers.6.self_attn] { - (%627:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=245)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %332:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %333:tensor<[1, 8, 992, 128], 
Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) -> (%659:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=266)], %640:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=254)], %642:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=256)]) { - linalg.CPU.LinearOp (%627:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=245)]) -> (%628:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=250)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=245), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=247), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=246))] (%627:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=245)]) -> (%629:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=247)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=245), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=249), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=248))] (%627:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=245)]) -> (%630:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=249)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=250), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=250), )] (%628:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=250)]) -> (%628:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=250)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=250), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=250), )] (%628:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=250)]) -> (%631:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=250)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=247), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=247), )] (%629:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=247)]) -> (%629:tensor<[1, 32, 8, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=247)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=247), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=247), )] (%629:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=247)]) -> (%632:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=247)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=249), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=249), )] (%630:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=249)]) -> (%630:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=249)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=249), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=249), )] (%630:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=249)]) -> (%633:tensor<[1, 8, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=249)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=250), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=251), )] (%631:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=250)]) -> (%634:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=251)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=247), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=252), )] (%632:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=247)]) -> (%635:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=252)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=251), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=251), )] (%634:tensor<[1, 16, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=251)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%636:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=251)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=252), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=252), )] (%635:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=252)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%637:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=252)]) - linalg.CPU.CastTypeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=252), outputs_0:QuantSpec(Raw(type: Float16), uuid=253), )] (%637:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=252)]) -> (%638:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=253)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=253), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=254), )] (%638:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=253)]) -> (%639:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=254)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=254), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=254), )] (%639:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=254)]) -> (%640:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=254)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=249), outputs_0:QuantSpec(Raw(type: Float16), uuid=255), )] (%633:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=249)]) -> (%641:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=255)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=255), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=256), )] (%641:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=255)]) -> (%642:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=256)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=254), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9), )] (%332:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %640:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=254)]) -> (%643:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=256), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=37), )] (%333:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)], %642:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=256)]) -> (%644:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9), )] (%643:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)]) -> (%645:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37), )] (%644:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) -> (%646:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=251), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257), )] (%636:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=251)], %645:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)]) -> (%647:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257), inputs_1:QuantSpec(Raw(type: Float32), uuid=258), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257), )] (%647:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257)], %648:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=258), constant:[0.088388346]]) -> (%649:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=259), )] (%649:tensor<[1, 16, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257)]) -> (%650:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=259)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=259), inputs_1:QuantSpec(Raw(type: Int16), uuid=260), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=259), )] (%650:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=259)], %651:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=260), constant:[-20]]) -> (%652:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=259)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=261), outputs_0:QuantSpec(Raw(type: UInt8), uuid=262), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %653:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=261), constant:[0]]) -> (%654:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=262)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=262), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=259), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=259), )] (%654:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=262)], %649:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=257)], %652:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=259)]) -> (%655:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=259)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=259), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=263), )] (%655:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=259)]) -> (%656:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=263)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=263), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264), )] (%656:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=263)], %646:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=37)]) -> (%657:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264), )] (%657:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264)]) -> (%658:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264), )] (%658:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264)]) -> (%658:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=266), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=265))] (%658:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=264)]) -> (%659:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=266)]) - cf.ReturnOp (%659:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=266)], %640:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=254)], %642:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=256)]) -> () - } - } - graph.SubGraphOp @model.layers.6.mlp [using_qnn:true, symbol:model.layers.6.mlp] { - (%661:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=267)]) -> (%666:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=267), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 
4, quant_to_type: UInt4, scale_1_type: Float32), uuid=268))] (%661:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=267)]) -> (%662:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=270), )] (%662:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=269)]) -> (%663:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=270)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=267), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=272), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=271))] (%661:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=267)]) -> (%664:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=272)]) - linalg.CPU.MulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=270), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=272), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=270), )] (%663:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=270)], %664:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=272)]) -> (%665:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=270)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=270), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=273))] (%665:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=270)]) -> (%666:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274)]) - cf.ReturnOp (%666:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274)]) -> () - } - } - graph.SubGraphOp 
@model.layers.7 [using_qnn:true, symbol:model.layers.7] { - (%667:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %334:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %335:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) -> (%708:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304)], %681:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)], %683:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=286)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275), )] (%667:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=274)]) -> (%668:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275)]) - graph.CallGraphOp @model.layers.7.self_attn (%668:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %334:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %335:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) -> (%700:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=296)], %681:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)], %683:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=286)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=296), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=274), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=296), )] (%700:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=296)], %667:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=274)]) -> (%701:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=296)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=296), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=297), )] (%701:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=296)]) -> (%702:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=297)]) - graph.CallGraphOp @model.layers.7.mlp (%702:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=297)]) -> (%707:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=296), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304), )] (%707:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304)], %701:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=296)]) -> (%708:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304)]) - cf.ReturnOp (%708:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304)], %681:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)], %683:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=286)]) -> () - } - } - graph.SubGraphOp @model.layers.7.self_attn [using_qnn:true, symbol:model.layers.7.self_attn] { - (%668:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], 
UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %334:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %335:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) -> (%700:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=296)], %681:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)], %683:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=286)]) { - linalg.CPU.LinearOp (%668:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275)]) -> (%669:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=280)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=276))] (%668:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275)]) -> (%670:tensor<[1, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=278))] (%668:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=275)]) -> (%671:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=280), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=280), )] (%669:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=280)]) -> (%669:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=280)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=280), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=280), )] (%669:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=280)]) -> (%672:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=280)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277), )] (%670:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277)]) -> (%670:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277), )] (%670:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277)]) -> (%673:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279), )] (%671:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279)]) -> (%671:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279), )] (%671:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279)]) -> (%674:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=280), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=281), )] (%672:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=280)]) -> (%675:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=281)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=282), )] (%673:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=277)]) -> (%676:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=282)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=281), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=281), )] (%675:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=281)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%677:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=281)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=282), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=282), )] (%676:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=282)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> 
(%678:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=282)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=282), outputs_0:QuantSpec(Raw(type: Float16), uuid=283), )] (%678:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=282)]) -> (%679:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=283)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=283), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284), )] (%679:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=283)]) -> (%680:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284), )] (%680:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)]) -> (%681:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=279), outputs_0:QuantSpec(Raw(type: Float16), uuid=285), )] (%674:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=279)]) -> (%682:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=285)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=285), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=286), )] (%682:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=285)]) -> (%683:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=286)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10), )] (%334:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %681:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)]) -> (%684:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=286), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38), )] (%335:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)], %683:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=286)]) -> (%685:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10), )] (%684:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)]) -> (%686:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38), )] (%685:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) -> (%687:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 
127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=281), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287), )] (%677:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=281)], %686:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)]) -> (%688:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287), inputs_1:QuantSpec(Raw(type: Float32), uuid=288), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287), )] (%688:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287)], %689:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=288), constant:[0.088388346]]) -> (%690:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=287), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=289), )] (%690:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287)]) -> (%691:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=289)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=289), inputs_1:QuantSpec(Raw(type: Int16), uuid=290), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=289), )] (%691:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=289)], %692:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=290), constant:[-20]]) -> (%693:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=289)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=291), outputs_0:QuantSpec(Raw(type: UInt8), uuid=292), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %694:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=291), constant:[0]]) -> (%695:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=292)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=292), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=287), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=289), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=289), )] (%695:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=292)], %690:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=287)], %693:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=289)]) -> (%696:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=289)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=289), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=293), )] (%696:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=289)]) -> (%697:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=293)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=293), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=294), )] (%697:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=293)], %687:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=38)]) -> (%698:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294), )] (%698:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)]) -> (%699:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294), )] (%699:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)]) -> (%699:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)]) - linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=296), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=295))] (%699:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=294)]) -> (%700:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=296)]) - cf.ReturnOp (%700:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=296)], %681:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=284)], %683:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=286)]) -> () - } - } - graph.SubGraphOp @model.layers.7.mlp [using_qnn:true, symbol:model.layers.7.mlp] { - (%702:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=297)]) -> (%707:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=297), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=299), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=298))] (%702:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=297)]) -> (%703:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=299)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=299), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=300), )] (%703:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=299)]) -> (%704:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=300)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=297), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=302), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=301))] (%702:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=297)]) -> (%705:tensor<[1, 32, 6144], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=302)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=300), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=302), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=300), )] (%704:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=300)], %705:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=302)]) -> (%706:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=300)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=300), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=303))] (%706:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=300)]) -> (%707:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304)]) - cf.ReturnOp (%707:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304)]) -> () - } - } - graph.SubGraphOp @model.layers.8 [using_qnn:true, symbol:model.layers.8] { - (%708:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %336:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %337:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) -> (%749:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=334)], %722:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=314)], %724:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=305), )] (%708:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304)]) -> (%709:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=305)]) - graph.CallGraphOp @model.layers.8.self_attn (%709:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=305)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %336:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %337:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) -> (%741:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326)], %722:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=314)], %724:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)]) - linalg.CPU.AddOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326), )] (%741:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326)], %708:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=304)]) -> (%742:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=327), )] (%742:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326)]) -> (%743:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=327)]) - graph.CallGraphOp @model.layers.8.mlp (%743:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=327)]) -> (%748:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=334)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=334), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=334), )] (%748:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=334)], %742:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326)]) -> (%749:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=334)]) - cf.ReturnOp (%749:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=334)], %722:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=314)], %724:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)]) -> () - } - } - graph.SubGraphOp @model.layers.8.self_attn [using_qnn:true, symbol:model.layers.8.self_attn] { - (%709:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=305)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 
128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %336:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %337:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) -> (%741:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326)], %722:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=314)], %724:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)]) { - linalg.CPU.LinearOp (%709:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=305)]) -> (%710:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=310)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=305), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=306))] (%709:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=305)]) -> (%711:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=305), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=308))] (%709:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=305)]) -> (%712:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=310), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=310), )] (%710:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=310)]) -> (%710:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=310)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=310), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=310), )] (%710:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=310)]) -> (%713:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=310)]) - linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307), )] (%711:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307)]) -> (%711:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307), )] (%711:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307)]) -> (%714:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309), )] (%712:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309)]) -> (%712:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309)]) - 
linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309), )] (%712:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309)]) -> (%715:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=310), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=311), )] (%713:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=310)]) -> (%716:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=311)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=312), )] (%714:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=307)]) -> (%717:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=312)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=311), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=311), )] (%716:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=311)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%718:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=311)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=312), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=312), )] (%717:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=312)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%719:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=312)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=312), outputs_0:QuantSpec(Raw(type: Float16), uuid=313), )] (%719:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=312)]) -> (%720:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=313)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=313), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=314), )] (%720:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=313)]) -> (%721:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=314)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=314), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=314), )] (%721:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=314)]) -> (%722:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=314)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309), outputs_0:QuantSpec(Raw(type: Float16), uuid=315), )] (%715:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=309)]) -> (%723:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=315)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=315), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316), )] (%723:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=315)]) -> (%724:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=314), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11), )] (%336:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %722:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=314)]) -> (%725:tensor<[1, 8, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39), )] (%337:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)], %724:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)]) -> (%726:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11), )] (%725:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)]) -> (%727:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39), 
)] (%726:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) -> (%728:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=311), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=317), )] (%718:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=311)], %727:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)]) -> (%729:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=317)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=317), inputs_1:QuantSpec(Raw(type: Float32), uuid=318), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=317), )] (%729:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=317)], %730:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=318), constant:[0.088388346]]) -> (%731:tensor<[1, 16, 32, 1024], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=317)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=317), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319), )] (%731:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=317)]) -> (%732:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319), inputs_1:QuantSpec(Raw(type: Int16), uuid=320), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319), )] (%732:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319)], %733:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=320), constant:[-20]]) -> (%734:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=321), outputs_0:QuantSpec(Raw(type: UInt8), uuid=322), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %735:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), 
uuid=321), constant:[1]]) -> (%736:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=322)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=322), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=317), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319), )] (%736:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=322)], %731:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=317)], %734:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319)]) -> (%737:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=323), )] (%737:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=319)]) -> (%738:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=323)]) - linalg.CPU.MatMulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=323), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=324), )] (%738:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=323)], %728:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=39)]) -> (%739:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=324)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=324), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=324), )] (%739:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=324)]) -> (%740:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=324)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=324), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=324), )] (%740:tensor<[1, 32, 16, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=324)]) -> (%740:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=324)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=324), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=325))] (%740:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=324)]) -> (%741:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326)]) - cf.ReturnOp (%741:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=326)], %722:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=314)], %724:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=316)]) -> () - } - } - graph.SubGraphOp @model.layers.8.mlp [using_qnn:true, symbol:model.layers.8.mlp] { - (%743:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=327)]) -> (%748:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=334)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=327), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=329), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=328))] (%743:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=327)]) -> (%744:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=329)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=329), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=330), )] (%744:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=329)]) -> (%745:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=330)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=327), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=332), weight_weight:QuantSpec(LPBQ(quant_min: -8, 
quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=331))] (%743:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=327)]) -> (%746:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=332)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=330), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=332), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=330), )] (%745:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=330)], %746:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=332)]) -> (%747:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=330)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=330), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=334), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=333))] (%747:tensor<[1, 32, 6144], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=330)]) -> (%748:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=334)]) - cf.ReturnOp (%748:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=334)]) -> () - } - } - graph.SubGraphOp @model.layers.9 [using_qnn:true, symbol:model.layers.9] { - (%749:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=334)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %338:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %339:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) -> (%790:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=364)], %763:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=344)], %765:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=346)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=334), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=335), )] (%749:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=334)]) -> (%750:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=335)]) - graph.CallGraphOp @model.layers.9.self_attn (%750:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=335)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %338:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %339:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) -> (%782:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=356)], 
%763:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=344)], %765:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=346)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=356), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=334), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=356), )] (%782:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=356)], %749:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=334)]) -> (%783:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=356)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=356), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=357), )] (%783:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=356)]) -> (%784:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=357)]) - graph.CallGraphOp 
@model.layers.9.mlp (%784:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=357)]) -> (%789:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=364)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=364), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=356), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=364), )] (%789:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=364)], %783:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=356)]) -> (%790:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=364)]) - cf.ReturnOp (%790:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=364)], %763:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=344)], %765:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=346)]) -> () - } - } - graph.SubGraphOp @model.layers.9.self_attn [using_qnn:true, symbol:model.layers.9.self_attn] { - 
(%750:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=335)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %338:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %339:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) -> (%782:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=356)], %763:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=344)], %765:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=346)]) { - linalg.CPU.LinearOp (%750:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=335)]) -> (%751:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=340)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=335), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=336))] (%750:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=335)]) -> (%752:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=335), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=339), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=338))] (%750:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=335)]) -> (%753:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=339)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=340), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=340), )] (%751:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=340)]) -> (%751:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=340)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=340), 
outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=340), )] (%751:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=340)]) -> (%754:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=340)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337), )] (%752:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)]) -> (%752:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337), )] (%752:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)]) -> (%755:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=339), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=339), )] (%753:tensor<[1, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=339)]) -> (%753:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=339)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=339), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=339), )] (%753:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=339)]) -> (%756:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=339)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=340), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341), )] (%754:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=340)]) -> (%757:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=337), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=342), )] (%755:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=337)]) -> (%758:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=342)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341), )] (%757:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%759:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=342), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=342), )] (%758:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=342)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%760:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=342)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=342), outputs_0:QuantSpec(Raw(type: Float16), uuid=343), )] (%760:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=342)]) -> (%761:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=343)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=343), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=344), )] (%761:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=343)]) -> (%762:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=344)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=344), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: 
Int8, scale_type: Float32), uuid=344), )] (%762:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=344)]) -> (%763:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=344)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=339), outputs_0:QuantSpec(Raw(type: Float16), uuid=345), )] (%756:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=339)]) -> (%764:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=345)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=345), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=346), )] (%764:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=345)]) -> (%765:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=346)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=344), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12), )] (%338:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], 
%763:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=344)]) -> (%766:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=346), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40), )] (%339:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)], %765:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=346)]) -> (%767:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12), )] (%766:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)]) -> (%768:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)]) - linalg.CPU.RepeatOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40), )] (%767:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) -> (%769:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=347), )] (%759:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=341)], %768:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)]) -> (%770:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=347)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=347), inputs_1:QuantSpec(Raw(type: Float32), uuid=348), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=347), )] (%770:tensor<[1, 16, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=347)], %771:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=348), constant:[0.088388346]]) -> (%772:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=347)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=347), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=349), )] (%772:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=347)]) -> (%773:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=349)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=349), inputs_1:QuantSpec(Raw(type: Int16), uuid=350), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=349), )] (%773:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=349)], %774:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=350), constant:[-20]]) -> (%775:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=349)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), 
uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=351), outputs_0:QuantSpec(Raw(type: UInt8), uuid=352), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %776:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=351), constant:[-0.1796875]]) -> (%777:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=352)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=352), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=347), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=349), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=349), )] (%777:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=352)], %772:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=347)], %775:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=349)]) -> (%778:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=349)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=349), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=353), )] (%778:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=349)]) -> (%779:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=353)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=353), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=354), )] (%779:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=353)], %769:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=40)]) -> (%780:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=354)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=354), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=354), )] (%780:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=354)]) -> (%781:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=354)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=354), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=354), )] (%781:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=354)]) -> (%781:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=354)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=354), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=356), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=355))] (%781:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=354)]) -> (%782:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=356)]) - cf.ReturnOp (%782:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=356)], %763:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=344)], %765:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=346)]) -> () - } - } - graph.SubGraphOp @model.layers.9.mlp [using_qnn:true, symbol:model.layers.9.mlp] { - (%784:tensor<[1, 
32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=357)]) -> (%789:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=364)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=357), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=359), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=358))] (%784:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=357)]) -> (%785:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=359)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=359), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360), )] (%785:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=359)]) -> (%786:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=357), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=361))] (%784:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=357)]) -> (%787:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360), )] (%786:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360)], %787:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=362)]) -> (%788:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=364), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, 
block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=363))] (%788:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=360)]) -> (%789:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=364)]) - cf.ReturnOp (%789:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=364)]) -> () - } - } - graph.SubGraphOp @model.layers.10 [using_qnn:true, symbol:model.layers.10] { - (%790:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=364)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %340:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %341:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) -> (%831:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394)], %804:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=374)], %806:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=376)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=364), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=365), )] (%790:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=364)]) -> (%791:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=365)]) - graph.CallGraphOp @model.layers.10.self_attn (%791:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=365)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %340:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %341:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) -> 
(%823:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=386)], %804:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=374)], %806:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=376)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=386), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=364), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=386), )] (%823:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=386)], %790:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=364)]) -> (%824:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=386)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=386), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387), )] (%824:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=386)]) -> (%825:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387)]) - graph.CallGraphOp @model.layers.10.mlp (%825:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387)]) -> (%830:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=386), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394), )] (%830:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394)], %824:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=386)]) -> (%831:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394)]) - cf.ReturnOp (%831:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394)], %804:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=374)], %806:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=376)]) -> () - } - } - graph.SubGraphOp @model.layers.10.self_attn [using_qnn:true, symbol:model.layers.10.self_attn] { - (%791:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=365)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %340:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %341:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) -> (%823:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=386)], %804:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=374)], %806:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=376)]) { - linalg.CPU.LinearOp (%791:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=365)]) -> (%792:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=370)]) - 
linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=365), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=366))] (%791:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=365)]) -> (%793:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=365), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=369), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=368))] (%791:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=365)]) -> (%794:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=369)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=370), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=370), )] (%792:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=370)]) -> (%792:tensor<[1, 32, 16, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=370)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=370), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=370), )] (%792:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=370)]) -> (%795:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=370)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367), )] (%793:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367)]) -> (%793:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367), )] (%793:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367)]) -> (%796:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=369), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=369), )] (%794:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=369)]) -> (%794:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=369)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=369), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=369), )] (%794:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=369)]) -> (%797:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=369)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=370), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371), )] (%795:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=370)]) -> (%798:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372), )] 
(%796:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=367)]) -> (%799:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371), )] (%798:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%800:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372), )] (%799:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%801:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372), outputs_0:QuantSpec(Raw(type: Float16), uuid=373), )] (%801:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=372)]) -> (%802:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=373)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=373), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=374), )] (%802:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=373)]) -> (%803:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=374)]) - linalg.CPU.TransposeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=374), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=374), )] (%803:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=374)]) -> (%804:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=374)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=369), outputs_0:QuantSpec(Raw(type: Float16), uuid=375), )] (%797:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=369)]) -> (%805:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=375)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=375), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=376), )] (%805:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=375)]) -> (%806:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=376)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=374), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=13), )] (%340:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %804:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=374)]) -> (%807:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=376), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41), )] (%341:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)], %806:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=376)]) -> (%808:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13), )] (%807:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)]) -> 
(%809:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41), )] (%808:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) -> (%810:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377), )] (%800:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=371)], %809:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)]) -> (%811:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377), inputs_1:QuantSpec(Raw(type: Float32), 
uuid=378), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377), )] (%811:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377)], %812:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=378), constant:[0.088388346]]) -> (%813:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379), )] (%813:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377)]) -> (%814:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379), inputs_1:QuantSpec(Raw(type: Int16), uuid=380), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379), )] (%814:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379)], %815:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=380), constant:[-20]]) -> (%816:tensor<[1, 16, 32, 1], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=381), outputs_0:QuantSpec(Raw(type: UInt8), uuid=382), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %817:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=381), constant:[-0.93359375]]) -> (%818:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=382)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=382), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379), )] (%818:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=382)], %813:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=377)], %816:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379)]) -> (%819:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=383), )] (%819:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=379)]) -> (%820:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=383)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=383), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=384), )] (%820:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=383)], %810:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=41)]) -> (%821:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=384)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=384), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=384), )] (%821:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=384)]) -> (%822:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=384)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=384), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=384), )] (%822:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=384)]) -> (%822:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=384)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=384), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=386), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=385))] (%822:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=384)]) -> (%823:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=386)]) - cf.ReturnOp (%823:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=386)], %804:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=374)], %806:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=376)]) -> () - } - } - graph.SubGraphOp @model.layers.10.mlp [using_qnn:true, symbol:model.layers.10.mlp] { - (%825:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387)]) -> (%830:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=388))] (%825:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387)]) -> (%826:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=390), )] (%826:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=389)]) -> (%827:tensor<[1, 32, 6144], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=390)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=392), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=391))] (%825:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=387)]) -> (%828:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=392)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=390), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=392), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=390), )] (%827:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=390)], %828:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=392)]) -> (%829:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=390)]) - linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=390), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=393))] (%829:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=390)]) -> (%830:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394)]) - cf.ReturnOp (%830:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394)]) -> () - } - } - graph.SubGraphOp @model.layers.11 [using_qnn:true, symbol:model.layers.11] { - (%831:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %342:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %343:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) -> (%872:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=424)], %845:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=404)], %847:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=406)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=395), )] (%831:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394)]) -> (%832:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=395)]) - graph.CallGraphOp @model.layers.11.self_attn (%832:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=395)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %342:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %343:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) -> (%864:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=416)], %845:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=404)], %847:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=406)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=416), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=416), )] (%864:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=416)], %831:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=394)]) -> (%865:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=416)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=416), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=417), )] (%865:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=416)]) -> (%866:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=417)]) - graph.CallGraphOp @model.layers.11.mlp (%866:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=417)]) -> (%871:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=424)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=424), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=416), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=424), )] (%871:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=424)], %865:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=416)]) -> (%872:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=424)]) - cf.ReturnOp (%872:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=424)], %845:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=404)], %847:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=406)]) -> () - } - } - graph.SubGraphOp @model.layers.11.self_attn [using_qnn:true, symbol:model.layers.11.self_attn] { - (%832:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=395)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %342:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %343:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) -> (%864:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=416)], %845:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=404)], %847:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=406)]) { 
- linalg.CPU.LinearOp (%832:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=395)]) -> (%833:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=400)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=395), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=396))] (%832:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=395)]) -> (%834:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=395), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=399), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=398))] (%832:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=395)]) -> (%835:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=399)]) - linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=400), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=400), )] (%833:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=400)]) -> (%833:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=400)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=400), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=400), )] (%833:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=400)]) -> (%836:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=400)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397), )] (%834:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397)]) -> (%834:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397), )] (%834:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397)]) -> (%837:tensor<[1, 8, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=399), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=399), )] (%835:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=399)]) -> (%835:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=399)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=399), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=399), )] (%835:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=399)]) -> (%838:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=399)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=400), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=401), )] (%836:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=400)]) -> (%839:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=401)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=402), )] (%837:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=397)]) -> (%840:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=402)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=401), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=401), )] (%839:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=401)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%841:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=401)]) - linalg.CPU.RoPEOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=402), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=402), )] (%840:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=402)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%842:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=402)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=402), outputs_0:QuantSpec(Raw(type: Float16), uuid=403), )] (%842:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=402)]) -> (%843:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=403)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=403), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=404), )] (%843:tensor<[1, 
8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=403)]) -> (%844:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=404)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=404), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=404), )] (%844:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=404)]) -> (%845:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=404)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=399), outputs_0:QuantSpec(Raw(type: Float16), uuid=405), )] (%838:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=399)]) -> (%846:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=405)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=405), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=406), )] (%846:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=405)]) -> (%847:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=406)]) - linalg.CPU.ConcatOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=404), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14), )] (%342:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %845:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=404)]) -> (%848:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=406), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42), )] (%343:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)], %847:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=406)]) -> (%849:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=14), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14), )] (%848:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)]) -> (%850:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42), )] (%849:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) -> (%851:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=401), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=407), )] (%841:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=401)], %850:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)]) -> (%852:tensor<[1, 16, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=407)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=407), inputs_1:QuantSpec(Raw(type: Float32), uuid=408), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=407), )] (%852:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=407)], %853:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=408), constant:[0.088388346]]) -> (%854:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=407)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=407), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409), )] (%854:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=407)]) -> (%855:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409), inputs_1:QuantSpec(Raw(type: Int16), uuid=410), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409), )] (%855:tensor<[1, 
16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409)], %856:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=410), constant:[-20]]) -> (%857:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=411), outputs_0:QuantSpec(Raw(type: UInt8), uuid=412), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %858:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=411), constant:[0.515625]]) -> (%859:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=412)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=412), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=407), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409), )] (%859:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=412)], %854:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=407)], %857:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409)]) -> (%860:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=409)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=413), )] (%860:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=409)]) -> (%861:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=413)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=413), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=414), )] (%861:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=413)], %851:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=42)]) -> (%862:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=414)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=414), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=414), )] 
(%862:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=414)]) -> (%863:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=414)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=414), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=414), )] (%863:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=414)]) -> (%863:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=414)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=414), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=416), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=415))] (%863:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=414)]) -> (%864:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=416)]) - cf.ReturnOp (%864:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=416)], %845:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=404)], %847:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=406)]) -> () - } - } - graph.SubGraphOp @model.layers.11.mlp [using_qnn:true, symbol:model.layers.11.mlp] { - (%866:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=417)]) -> (%871:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=424)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=417), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=419), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=418))] (%866:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=417)]) -> (%867:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=419)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=419), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=420), )] 
(%867:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=419)]) -> (%868:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=420)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=417), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=422), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=421))] (%866:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=417)]) -> (%869:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=422)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=420), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=422), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=420), )] (%868:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=420)], %869:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=422)]) -> (%870:tensor<[1, 32, 
6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=420)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=420), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=424), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=423))] (%870:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=420)]) -> (%871:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=424)]) - cf.ReturnOp (%871:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=424)]) -> () - } - } - graph.SubGraphOp @model.layers.12 [using_qnn:true, symbol:model.layers.12] { - (%872:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=424)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %344:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %345:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) -> (%913:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=454)], %886:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=434)], %888:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=436)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=424), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=425), )] (%872:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=424)]) -> (%873:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=425)]) - graph.CallGraphOp @model.layers.12.self_attn (%873:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=425)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %344:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %345:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) -> (%905:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=446)], %886:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=434)], %888:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=436)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=446), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=424), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=446), )] (%905:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=446)], %872:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=424)]) -> (%906:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=446)]) - linalg.CPU.RMSNormOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=446), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=447), )] (%906:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=446)]) -> (%907:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=447)]) - graph.CallGraphOp @model.layers.12.mlp (%907:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=447)]) -> (%912:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=454)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=454), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=446), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=454), )] (%912:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=454)], %906:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=446)]) -> (%913:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=454)]) - cf.ReturnOp (%913:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=454)], %886:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=434)], %888:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=436)]) -> () - } - } - graph.SubGraphOp @model.layers.12.self_attn [using_qnn:true, symbol:model.layers.12.self_attn] { - (%873:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=425)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %344:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %345:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) -> (%905:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=446)], %886:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=434)], 
%888:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=436)]) { - linalg.CPU.LinearOp (%873:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=425)]) -> (%874:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=430)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=425), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=426))] (%873:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=425)]) -> (%875:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=425), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=429), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=428))] (%873:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=425)]) -> (%876:tensor<[1, 32, 
1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=429)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=430), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=430), )] (%874:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=430)]) -> (%874:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=430)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=430), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=430), )] (%874:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=430)]) -> (%877:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=430)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427), )] (%875:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427)]) -> (%875:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427), )] (%875:tensor<[1, 32, 8, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427)]) -> (%878:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=429), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=429), )] (%876:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=429)]) -> (%876:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=429)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=429), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=429), )] (%876:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=429)]) -> (%879:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=429)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=430), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=431), )] (%877:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), 
uuid=430)]) -> (%880:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=431)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=432), )] (%878:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=427)]) -> (%881:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=432)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=431), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=431), )] (%880:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=431)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%882:tensor<[1, 16, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=431)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=432), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=432), )] (%881:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=432)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%883:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=432)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=432), outputs_0:QuantSpec(Raw(type: Float16), uuid=433), )] (%883:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=432)]) -> (%884:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=433)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: 
Float16), uuid=433), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=434), )] (%884:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=433)]) -> (%885:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=434)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=434), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=434), )] (%885:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=434)]) -> (%886:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=434)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=429), outputs_0:QuantSpec(Raw(type: Float16), uuid=435), )] (%879:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=429)]) -> (%887:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=435)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=435), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=436), )] (%887:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=435)]) -> (%888:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=436)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=434), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15), )] (%344:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %886:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=434)]) -> (%889:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=436), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43), )] (%345:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)], %888:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=436)]) -> (%890:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=43)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15), )] (%889:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)]) -> (%891:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43), )] (%890:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) -> (%892:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=431), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=437), )] (%882:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=431)], %891:tensor<[1, 16, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)]) -> (%893:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=437)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=437), inputs_1:QuantSpec(Raw(type: Float32), uuid=438), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=437), )] (%893:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=437)], %894:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=438), constant:[0.088388346]]) -> (%895:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=437)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=437), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439), )] (%895:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=437)]) -> (%896:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439), 
inputs_1:QuantSpec(Raw(type: Int16), uuid=440), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439), )] (%896:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)], %897:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=440), constant:[-20]]) -> (%898:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=441), outputs_0:QuantSpec(Raw(type: UInt8), uuid=442), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %899:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=441), constant:[0.74609375]]) -> (%900:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=442)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=442), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=437), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439), )] (%900:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=442)], %895:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=437)], %898:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)]) -> (%901:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443), )] (%901:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=439)]) -> (%902:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=444), )] (%902:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=443)], %892:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=43)]) -> (%903:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=444)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=444), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=444), )] (%903:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=444)]) -> (%904:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=444)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=444), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=444), )] (%904:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=444)]) -> (%904:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=444)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=444), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=446), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=445))] (%904:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=444)]) -> (%905:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=446)]) - cf.ReturnOp (%905:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=446)], %886:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=434)], %888:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=436)]) -> () - } - } - graph.SubGraphOp @model.layers.12.mlp [using_qnn:true, symbol:model.layers.12.mlp] { - (%907:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=447)]) -> (%912:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=454)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=447), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=449), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=448))] (%907:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=447)]) -> (%908:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=449)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=449), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=450), )] (%908:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=449)]) -> (%909:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=450)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=447), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=452), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=451))] (%907:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=447)]) -> (%910:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=452)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=450), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=452), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=450), )] (%909:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=450)], 
%910:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=452)]) -> (%911:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=450)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=450), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=454), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=453))] (%911:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=450)]) -> (%912:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=454)]) - cf.ReturnOp (%912:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=454)]) -> () - } - } - graph.SubGraphOp @model.layers.13 [using_qnn:true, symbol:model.layers.13] { - (%913:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=454)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %346:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %347:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) -> (%954:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=484)], %927:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=464)], %929:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=466)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=454), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455), )] (%913:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=454)]) -> (%914:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455)]) - graph.CallGraphOp @model.layers.13.self_attn (%914:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %346:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %347:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) -> (%946:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=476)], %927:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=464)], %929:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=466)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=476), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=454), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=476), )] (%946:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=476)], %913:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=454)]) -> (%947:tensor<[1, 
32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=476)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=476), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477), )] (%947:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=476)]) -> (%948:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477)]) - graph.CallGraphOp @model.layers.13.mlp (%948:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477)]) -> (%953:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=484)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=484), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=476), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=484), )] (%953:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=484)], %947:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=476)]) -> (%954:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=484)]) - cf.ReturnOp (%954:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=484)], %927:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=464)], %929:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=466)]) -> () - } - } - graph.SubGraphOp @model.layers.13.self_attn [using_qnn:true, symbol:model.layers.13.self_attn] { - (%914:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %346:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %347:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) -> (%946:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=476)], 
%927:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=464)], %929:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=466)]) { - linalg.CPU.LinearOp (%914:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455)]) -> (%915:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=460)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=456))] (%914:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455)]) -> (%916:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=459), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=458))] (%914:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=455)]) -> (%917:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=459)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=460), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=460), )] (%915:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=460)]) -> (%915:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=460)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=460), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=460), )] (%915:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=460)]) -> (%918:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=460)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457), )] (%916:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457)]) -> (%916:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457), )] (%916:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457)]) -> (%919:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=459), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=459), )] (%917:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=459)]) -> (%917:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=459)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=459), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=459), )] (%917:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=459)]) -> (%920:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=459)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=460), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=461), )] (%918:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=460)]) -> (%921:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=461)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462), )] (%919:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=457)]) -> (%922:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=461), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=461), )] (%921:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=461)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%923:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=461)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462), )] (%922:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%924:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462), outputs_0:QuantSpec(Raw(type: Float16), uuid=463), )] (%924:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=462)]) -> 
(%925:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=463)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=463), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=464), )] (%925:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=463)]) -> (%926:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=464)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=464), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=464), )] (%926:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=464)]) -> (%927:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=464)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=459), outputs_0:QuantSpec(Raw(type: Float16), uuid=465), )] (%920:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=459)]) -> (%928:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=465)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=465), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=466), )] 
(%928:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=465)]) -> (%929:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=466)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=464), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16), )] (%346:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %927:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=464)]) -> (%930:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=466), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44), )] (%347:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)], %929:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=466)]) -> (%931:tensor<[1, 8, 1024, 
128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16), )] (%930:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)]) -> (%932:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44), )] (%931:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) -> (%933:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=461), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=467), )] (%923:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=461)], %932:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)]) -> (%934:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=467)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=467), inputs_1:QuantSpec(Raw(type: Float32), uuid=468), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=467), )] (%934:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=467)], %935:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=468), constant:[0.088388346]]) -> (%936:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=467)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=467), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469), )] (%936:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=467)]) -> (%937:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469)]) - linalg.CPU.AddOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469), inputs_1:QuantSpec(Raw(type: Int16), uuid=470), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469), )] (%937:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469)], %938:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=470), constant:[-20]]) -> (%939:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=471), outputs_0:QuantSpec(Raw(type: UInt8), uuid=472), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %940:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=471), constant:[-0.78515625]]) -> (%941:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=472)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=472), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=467), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469), )] (%941:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=472)], %936:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=467)], %939:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469)]) -> (%942:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473), )] (%942:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=469)]) -> (%943:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474), )] (%943:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=473)], %933:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=44)]) -> (%944:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=474)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474), )] (%944:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474)]) -> (%945:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474), )] (%945:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474)]) -> (%945:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=474), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=476), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=475))] (%945:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=474)]) -> (%946:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=476)]) - cf.ReturnOp (%946:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=476)], %927:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=464)], %929:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=466)]) -> () - } - } - graph.SubGraphOp @model.layers.13.mlp [using_qnn:true, symbol:model.layers.13.mlp] { - (%948:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477)]) -> (%953:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=484)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=478))] (%948:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477)]) -> (%949:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=480), )] (%949:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=479)]) -> (%950:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=480)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=482), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=481))] (%948:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=477)]) -> (%951:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=482)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=480), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=482), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=480), )] 
(%950:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=480)], %951:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=482)]) -> (%952:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=480)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=480), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=484), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=483))] (%952:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=480)]) -> (%953:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=484)]) - cf.ReturnOp (%953:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=484)]) -> () - } - } - graph.SubGraphOp @model.layers.14 [using_qnn:true, symbol:model.layers.14] { - (%954:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=484)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %348:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %349:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) -> (%995:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=514)], %968:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=494)], %970:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=496)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=484), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=485), )] (%954:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=484)]) -> (%955:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=485)]) - graph.CallGraphOp @model.layers.14.self_attn (%955:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=485)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %348:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %349:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) -> (%987:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506)], %968:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=494)], %970:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=496)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=484), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506), )] (%987:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506)], %954:tensor<[1, 32, 
2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=484)]) -> (%988:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507), )] (%988:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506)]) -> (%989:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)]) - graph.CallGraphOp @model.layers.14.mlp (%989:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)]) -> (%994:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=514)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=514), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=514), )] (%994:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=514)], %988:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506)]) -> (%995:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=514)]) - cf.ReturnOp (%995:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=514)], %968:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=494)], %970:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=496)]) -> () - } - } - graph.SubGraphOp @model.layers.14.self_attn [using_qnn:true, symbol:model.layers.14.self_attn] { - (%955:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=485)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %348:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %349:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) -> 
(%987:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506)], %968:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=494)], %970:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=496)]) { - linalg.CPU.LinearOp (%955:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=485)]) -> (%956:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=490)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=485), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=487), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=486))] (%955:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=485)]) -> (%957:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=487)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=485), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489), 
weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=488))] (%955:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=485)]) -> (%958:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=490), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=490), )] (%956:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=490)]) -> (%956:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=490)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=490), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=490), )] (%956:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=490)]) -> (%959:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=490)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=487), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=487), )] (%957:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=487)]) -> (%957:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=487)]) - 
linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=487), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=487), )] (%957:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=487)]) -> (%960:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=487)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489), )] (%958:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489)]) -> (%958:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489), )] (%958:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489)]) -> (%961:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=489)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=490), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491), )] (%959:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=490)]) -> (%962:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=487), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=492), )] (%960:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=487)]) -> (%963:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=492)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491), )] (%962:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491)], %379:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%964:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=492), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=492), )] (%963:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=492)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%965:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=492)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=492), outputs_0:QuantSpec(Raw(type: Float16), uuid=493), )] 
(%965:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=492)]) -> (%966:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=493)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=493), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=494), )] (%966:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=493)]) -> (%967:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=494)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=494), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=494), )] (%967:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=494)]) -> (%968:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=494)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489), outputs_0:QuantSpec(Raw(type: Float16), uuid=495), )] (%961:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=489)]) -> (%969:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=495)]) - linalg.CPU.CastTypeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=495), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=496), )] (%969:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=495)]) -> (%970:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=496)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=494), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17), )] (%348:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %968:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=494)]) -> (%971:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=496), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45), )] (%349:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)], 
%970:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=496)]) -> (%972:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17), )] (%971:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)]) -> (%973:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45), )] (%972:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) -> (%974:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=497), )] (%964:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=491)], %973:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)]) -> (%975:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=497)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=497), inputs_1:QuantSpec(Raw(type: Float32), uuid=498), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=497), )] (%975:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=497)], %976:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=498), constant:[0.088388346]]) -> (%977:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=497)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=497), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499), )] (%977:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=497)]) -> (%978:tensor<[1, 16, 32, 1], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499), inputs_1:QuantSpec(Raw(type: Int16), uuid=500), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499), )] (%978:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499)], %979:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=500), constant:[-20]]) -> (%980:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=501), outputs_0:QuantSpec(Raw(type: UInt8), uuid=502), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %981:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=501), constant:[-0.46289062]]) -> (%982:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=502)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=502), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=497), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499), )] (%982:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: 
UInt8), uuid=502)], %977:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=497)], %980:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499)]) -> (%983:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=503), )] (%983:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=499)]) -> (%984:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=503)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=503), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=45), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=504), )] (%984:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=503)], %974:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=45)]) -> (%985:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=504)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=504), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=504), )] (%985:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=504)]) -> (%986:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=504)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=504), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=504), )] (%986:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=504)]) -> (%986:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=504)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=504), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: 
Float32), uuid=505))] (%986:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=504)]) -> (%987:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506)]) - cf.ReturnOp (%987:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=506)], %968:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=494)], %970:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=496)]) -> () - } - } - graph.SubGraphOp @model.layers.14.mlp [using_qnn:true, symbol:model.layers.14.mlp] { - (%989:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)]) -> (%994:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=514)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=509), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=508))] (%989:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=507)]) -> (%990:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=509)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=509), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=510), )] (%990:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=509)]) -> (%991:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=510)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=512), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=511))] (%989:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=507)]) -> (%992:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=512)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=510), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=512), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=510), )] (%991:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=510)], %992:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=512)]) -> (%993:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=510)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=510), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=514), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=513))] (%993:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=510)]) -> (%994:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=514)]) - cf.ReturnOp (%994:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=514)]) -> () - } - } - graph.SubGraphOp @model.layers.15 [using_qnn:true, symbol:model.layers.15] { - (%995:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=514)], %379:tensor<[1, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %350:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %351:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) -> (%1036:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544)], %1009:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=524)], %1011:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=526)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=514), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=515), )] (%995:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=514)]) -> (%996:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=515)]) - graph.CallGraphOp 
@model.layers.15.self_attn (%996:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=515)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %350:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %351:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) -> (%1028:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536)], %1009:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=524)], %1011:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=526)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=514), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536), )] (%1028:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536)], %995:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=514)]) -> (%1029:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=537), )] (%1029:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536)]) -> (%1030:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=537)]) - graph.CallGraphOp @model.layers.15.mlp (%1030:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=537)]) -> (%1035:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544), )] 
(%1035:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544)], %1029:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536)]) -> (%1036:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544)]) - cf.ReturnOp (%1036:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544)], %1009:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=524)], %1011:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=526)]) -> () - } - } - graph.SubGraphOp @model.layers.15.self_attn [using_qnn:true, symbol:model.layers.15.self_attn] { - (%996:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=515)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %350:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], 
%351:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) -> (%1028:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536)], %1009:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=524)], %1011:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=526)]) { - linalg.CPU.LinearOp (%996:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=515)]) -> (%997:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=520)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=515), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=516))] (%996:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=515)]) -> (%998:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=515), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=519), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=518))] (%996:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=515)]) -> (%999:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=519)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=520), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=520), )] (%997:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=520)]) -> (%997:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=520)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=520), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=520), )] (%997:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=520)]) -> (%1000:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=520)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517), )] (%998:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517)]) -> (%998:tensor<[1, 32, 8, 
128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517), )] (%998:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517)]) -> (%1001:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=519), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=519), )] (%999:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=519)]) -> (%999:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=519)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=519), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=519), )] (%999:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=519)]) -> (%1002:tensor<[1, 
8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=519)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=520), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=521), )] (%1000:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=520)]) -> (%1003:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=521)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=522), )] (%1001:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=517)]) -> (%1004:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=522)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=521), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=521), )] (%1003:tensor<[1, 16, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=521)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1005:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=521)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=522), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=522), )] (%1004:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=522)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1006:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=522)]) - linalg.CPU.CastTypeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=522), outputs_0:QuantSpec(Raw(type: Float16), uuid=523), )] (%1006:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=522)]) -> (%1007:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=523)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=523), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=524), )] (%1007:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=523)]) -> (%1008:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=524)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=524), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=524), )] (%1008:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=524)]) -> (%1009:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=524)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=519), outputs_0:QuantSpec(Raw(type: Float16), uuid=525), )] (%1002:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=519)]) -> (%1010:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=525)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=525), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=526), )] (%1010:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=525)]) -> (%1011:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=526)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=524), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18), )] (%350:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %1009:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=524)]) -> (%1012:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=526), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=46), )] (%351:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)], %1011:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=526)]) -> (%1013:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18), )] (%1012:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)]) -> (%1014:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46), )] (%1013:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) -> (%1015:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=521), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=527), )] (%1005:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=521)], %1014:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)]) -> (%1016:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=527)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=527), inputs_1:QuantSpec(Raw(type: Float32), uuid=528), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=527), )] (%1016:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=527)], %1017:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=528), constant:[0.088388346]]) -> (%1018:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=527)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=527), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529), )] (%1018:tensor<[1, 16, 32, 1024], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=527)]) -> (%1019:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529), inputs_1:QuantSpec(Raw(type: Int16), uuid=530), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529), )] (%1019:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529)], %1020:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=530), constant:[-20]]) -> (%1021:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=531), outputs_0:QuantSpec(Raw(type: UInt8), uuid=532), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %1022:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=531), constant:[0.953125]]) -> (%1023:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=532)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=532), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=527), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529), )] (%1023:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=532)], %1018:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=527)], %1021:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529)]) -> (%1024:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=533), )] (%1024:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=529)]) -> (%1025:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=533)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=533), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=534), )] (%1025:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=533)], %1015:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=46)]) -> (%1026:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=534)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=534), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=534), )] (%1026:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=534)]) -> (%1027:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=534)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=534), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=534), )] (%1027:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=534)]) -> (%1027:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=534)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=534), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=536), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=535))] (%1027:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=534)]) -> (%1028:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536)]) - cf.ReturnOp (%1028:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=536)], %1009:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=524)], %1011:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=526)]) -> () - } - } - graph.SubGraphOp @model.layers.15.mlp [using_qnn:true, symbol:model.layers.15.mlp] { - (%1030:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=537)]) -> (%1035:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=537), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=539), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=538))] (%1030:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=537)]) -> (%1031:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=539)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=539), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=540), )] (%1031:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=539)]) -> (%1032:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=540)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=537), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=542), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=541))] (%1030:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=537)]) -> (%1033:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=542)]) - linalg.CPU.MulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=540), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=542), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=540), )] (%1032:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=540)], %1033:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=542)]) -> (%1034:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=540)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=540), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=543))] (%1034:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=540)]) -> (%1035:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544)]) - cf.ReturnOp (%1035:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544)]) -> () - } - } - 
graph.SubGraphOp @model.layers.16 [using_qnn:true, symbol:model.layers.16] { - (%1036:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %352:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %353:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) -> (%1077:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574)], %1050:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %1052:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545), )] (%1036:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544)]) -> (%1037:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545)]) - graph.CallGraphOp @model.layers.16.self_attn (%1037:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %352:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %353:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) -> (%1069:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)], %1050:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %1052:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566), )] (%1069:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)], %1036:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=544)]) -> (%1070:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=567), )] (%1070:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)]) -> (%1071:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=567)]) - graph.CallGraphOp @model.layers.16.mlp (%1071:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=567)]) -> (%1076:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=574), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574), )] (%1076:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574)], %1070:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)]) -> (%1077:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574)]) - cf.ReturnOp (%1077:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574)], %1050:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %1052:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)]) -> () - } - } - graph.SubGraphOp @model.layers.16.self_attn [using_qnn:true, symbol:model.layers.16.self_attn] { - (%1037:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %352:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %353:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) -> (%1069:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)], %1050:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %1052:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)]) { - linalg.CPU.LinearOp (%1037:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545)]) -> (%1038:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=550)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=546))] (%1037:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545)]) -> (%1039:tensor<[1, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=549), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=548))] (%1037:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=545)]) -> (%1040:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=549)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=550), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=550), )] (%1038:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=550)]) -> (%1038:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=550)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=550), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=550), )] (%1038:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=550)]) -> (%1041:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=550)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547), )] (%1039:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547)]) -> (%1039:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547), )] (%1039:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547)]) -> (%1042:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=549), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=549), )] (%1040:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=549)]) -> (%1040:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=549)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=549), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=549), )] (%1040:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=549)]) -> (%1043:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=549)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=550), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=551), )] (%1041:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=550)]) -> (%1044:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=551)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=552), )] (%1042:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=547)]) -> (%1045:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=552)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=551), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=551), )] (%1044:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=551)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1046:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=551)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=552), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=552), )] (%1045:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=552)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=64)]) -> (%1047:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=552)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=552), outputs_0:QuantSpec(Raw(type: Float16), uuid=553), )] (%1047:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=552)]) -> (%1048:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=553)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=553), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554), )] (%1048:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=553)]) -> (%1049:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554), )] (%1049:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)]) -> (%1050:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=549), outputs_0:QuantSpec(Raw(type: Float16), uuid=555), )] (%1043:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=549)]) -> (%1051:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=555)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=555), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556), )] (%1051:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=555)]) -> (%1052:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19), )] (%352:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %1050:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)]) -> (%1053:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=47), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47), )] (%353:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)], %1052:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)]) -> (%1054:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19), )] (%1053:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)]) -> (%1055:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47), )] (%1054:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) -> (%1056:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=551), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557), )] (%1046:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=551)], %1055:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)]) -> (%1057:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557), inputs_1:QuantSpec(Raw(type: Float32), uuid=558), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557), )] (%1057:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557)], %1058:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=558), constant:[0.088388346]]) -> (%1059:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=557), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559), )] (%1059:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557)]) -> (%1060:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559), inputs_1:QuantSpec(Raw(type: Int16), uuid=560), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559), )] (%1060:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559)], %1061:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=560), constant:[-20]]) -> (%1062:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=561), outputs_0:QuantSpec(Raw(type: UInt8), uuid=562), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %1063:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=561), constant:[0.118652344]]) -> (%1064:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=562)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=562), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559), )] (%1064:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=562)], %1059:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=557)], %1062:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559)]) -> (%1065:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=563), )] (%1065:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=559)]) -> (%1066:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=563)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=563), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564), )] (%1066:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=563)], %1056:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=47)]) -> (%1067:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564), )] (%1067:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564)]) -> (%1068:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564), )] (%1068:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564)]) -> (%1068:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=564)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=565))] (%1068:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=564)]) -> (%1069:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)]) - cf.ReturnOp (%1069:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=566)], %1050:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=554)], %1052:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=556)]) -> () - } - } - graph.SubGraphOp @model.layers.16.mlp [using_qnn:true, symbol:model.layers.16.mlp] { - (%1071:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=567)]) -> (%1076:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=567), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=569), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=568))] (%1071:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=567)]) -> (%1072:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=569)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=569), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=570), )] (%1072:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=569)]) -> (%1073:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=570)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=567), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=572), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=571))] (%1071:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=567)]) -> (%1074:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=572)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=570), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=572), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=570), )] (%1073:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=570)], %1074:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=572)]) -> (%1075:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=570)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=570), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=573))] (%1075:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=570)]) -> (%1076:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574)]) - 
cf.ReturnOp (%1076:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574)]) -> () - } - } - graph.SubGraphOp @model.layers.17 [using_qnn:true, symbol:model.layers.17] { - (%1077:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %354:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %355:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) -> (%1118:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604)], %1091:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=584)], %1093:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=586)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575), )] (%1077:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574)]) -> (%1078:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)]) - graph.CallGraphOp @model.layers.17.self_attn (%1078:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %354:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %355:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) -> (%1110:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=596)], %1091:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=584)], %1093:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: 
Int8, scale_type: Float32), uuid=586)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=596), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=596), )] (%1110:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=596)], %1077:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=574)]) -> (%1111:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=596)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=596), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597), )] (%1111:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=596)]) -> (%1112:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597)]) - graph.CallGraphOp @model.layers.17.mlp (%1112:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597)]) -> (%1117:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=596), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604), )] (%1117:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604)], %1111:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=596)]) -> (%1118:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604)]) - cf.ReturnOp (%1118:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604)], %1091:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=584)], %1093:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=586)]) -> () - } - } - graph.SubGraphOp @model.layers.17.self_attn [using_qnn:true, symbol:model.layers.17.self_attn] { - (%1078:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %354:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %355:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) -> (%1110:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=596)], %1091:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=584)], %1093:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=586)]) { - linalg.CPU.LinearOp (%1078:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)]) -> (%1079:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=580)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=577), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=576))] 
(%1078:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)]) -> (%1080:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=577)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=578))] (%1078:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=575)]) -> (%1081:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=580), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=580), )] (%1079:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=580)]) -> (%1079:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=580)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=580), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=580), )] (%1079:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=580)]) -> (%1082:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=580)]) 
- linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=577), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=577), )] (%1080:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=577)]) -> (%1080:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=577)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=577), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=577), )] (%1080:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=577)]) -> (%1083:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=577)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579), )] (%1081:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579)]) -> (%1081:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=579)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579), )] (%1081:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579)]) -> (%1084:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=580), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581), )] (%1082:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=580)]) -> (%1085:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=577), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=582), )] (%1083:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=577)]) -> (%1086:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=582)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581), )] (%1085:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1087:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=582), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=582), )] (%1086:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=582)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1088:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=582)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=582), outputs_0:QuantSpec(Raw(type: Float16), uuid=583), )] (%1088:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=582)]) -> (%1089:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=583)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=583), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=584), )] (%1089:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=583)]) -> (%1090:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=584)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=584), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=584), )] (%1090:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=584)]) -> (%1091:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=584)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579), outputs_0:QuantSpec(Raw(type: Float16), uuid=585), )] (%1084:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=579)]) -> (%1092:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=585)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=585), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=586), )] (%1092:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=585)]) -> (%1093:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=586)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=584), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20), )] (%354:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %1091:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=584)]) -> (%1094:tensor<[1, 8, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=586), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48), )] (%355:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)], %1093:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=586)]) -> (%1095:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20), )] (%1094:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)]) -> (%1096:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=48), )] (%1095:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) -> (%1097:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=587), )] (%1087:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=581)], %1096:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)]) -> (%1098:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=587)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=587), inputs_1:QuantSpec(Raw(type: Float32), uuid=588), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=587), )] (%1098:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=587)], %1099:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=588), constant:[0.088388346]]) -> 
(%1100:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=587)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=587), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=589), )] (%1100:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=587)]) -> (%1101:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=589)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=589), inputs_1:QuantSpec(Raw(type: Int16), uuid=590), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=589), )] (%1101:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=589)], %1102:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=590), constant:[-20]]) -> (%1103:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=589)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=591), outputs_0:QuantSpec(Raw(type: UInt8), uuid=592), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %1104:tensor<[1], UInt16, 
CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=591), constant:[-0.99609375]]) -> (%1105:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=592)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=592), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=587), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=589), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=589), )] (%1105:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=592)], %1100:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=587)], %1103:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=589)]) -> (%1106:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=589)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=589), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593), )] (%1106:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=589)]) -> (%1107:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593)]) - linalg.CPU.MatMulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=594), )] (%1107:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=593)], %1097:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=48)]) -> (%1108:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=594)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=594), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=594), )] (%1108:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=594)]) -> (%1109:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=594)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=594), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=594), )] (%1109:tensor<[1, 32, 16, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=594)]) -> (%1109:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=594)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=594), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=596), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=595))] (%1109:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=594)]) -> (%1110:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=596)]) - cf.ReturnOp (%1110:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=596)], %1091:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=584)], %1093:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=586)]) -> () - } - } - graph.SubGraphOp @model.layers.17.mlp [using_qnn:true, symbol:model.layers.17.mlp] { - (%1112:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597)]) -> (%1117:tensor<[1, 32, 
2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=599), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=598))] (%1112:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597)]) -> (%1113:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=599)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=599), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600), )] (%1113:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=599)]) -> (%1114:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=602), weight_weight:QuantSpec(LPBQ(quant_min: 
-8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=601))] (%1112:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=597)]) -> (%1115:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=602)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=602), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600), )] (%1114:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600)], %1115:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=602)]) -> (%1116:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=603))] (%1116:tensor<[1, 32, 6144], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=600)]) -> (%1117:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604)]) - cf.ReturnOp (%1117:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604)]) -> () - } - } - graph.SubGraphOp @model.layers.18 [using_qnn:true, symbol:model.layers.18] { - (%1118:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %356:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %357:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) -> (%1159:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)], %1132:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=614)], %1134:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=616)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=605), )] (%1118:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604)]) -> (%1119:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=605)]) - graph.CallGraphOp @model.layers.18.self_attn (%1119:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=605)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %356:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %357:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) -> (%1151:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=626)], 
%1132:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=614)], %1134:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=616)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=626), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=626), )] (%1151:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=626)], %1118:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=604)]) -> (%1152:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=626)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=626), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627), )] (%1152:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=626)]) -> (%1153:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627)]) - graph.CallGraphOp 
@model.layers.18.mlp (%1153:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627)]) -> (%1158:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=626), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634), )] (%1158:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)], %1152:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=626)]) -> (%1159:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)]) - cf.ReturnOp (%1159:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)], %1132:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=614)], %1134:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=616)]) -> () - } - } - graph.SubGraphOp @model.layers.18.self_attn [using_qnn:true, symbol:model.layers.18.self_attn] { 
- (%1119:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=605)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %356:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %357:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) -> (%1151:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=626)], %1132:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=614)], %1134:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=616)]) { - linalg.CPU.LinearOp (%1119:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=605)]) -> (%1120:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=610)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=605), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=607), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=606))] (%1119:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=605)]) -> (%1121:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=607)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=605), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=608))] (%1119:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=605)]) -> (%1122:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=610), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=610), )] (%1120:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=610)]) -> (%1120:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=610)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), 
uuid=610), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=610), )] (%1120:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=610)]) -> (%1123:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=610)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=607), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=607), )] (%1121:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=607)]) -> (%1121:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=607)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=607), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=607), )] (%1121:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=607)]) -> (%1124:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=607)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609), )] (%1122:tensor<[1, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)]) -> (%1122:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609), )] (%1122:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)]) -> (%1125:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=610), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=611), )] (%1123:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=610)]) -> (%1126:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=611)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=607), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=612), )] (%1124:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=607)]) -> (%1127:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=612)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=611), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=611), )] (%1126:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=611)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1128:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=611)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=612), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=612), )] (%1127:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=612)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1129:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=612)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=612), outputs_0:QuantSpec(Raw(type: Float16), uuid=613), )] (%1129:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=612)]) -> (%1130:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=613)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=613), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=614), )] (%1130:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=613)]) -> (%1131:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=614)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=614), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=614), )] (%1131:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=614)]) -> (%1132:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=614)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609), outputs_0:QuantSpec(Raw(type: Float16), uuid=615), )] (%1125:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=609)]) -> (%1133:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=615)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=615), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=616), )] (%1133:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=615)]) -> (%1134:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=616)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=614), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21), )] (%356:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=21)], %1132:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=614)]) -> (%1135:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=616), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49), )] (%357:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)], %1134:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=616)]) -> (%1136:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21), )] (%1135:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)]) -> (%1137:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)]) - linalg.CPU.RepeatOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49), )] (%1136:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) -> (%1138:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=611), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=617), )] (%1128:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=611)], %1137:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)]) -> (%1139:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=617)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=617), inputs_1:QuantSpec(Raw(type: Float32), uuid=618), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=617), )] (%1139:tensor<[1, 16, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=617)], %1140:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=618), constant:[0.088388346]]) -> (%1141:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=617)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=617), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619), )] (%1141:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=617)]) -> (%1142:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619), inputs_1:QuantSpec(Raw(type: Int16), uuid=620), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619), )] (%1142:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619)], %1143:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=620), constant:[-20]]) -> (%1144:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: 
UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=621), outputs_0:QuantSpec(Raw(type: UInt8), uuid=622), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %1145:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=621), constant:[0.24023438]]) -> (%1146:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=622)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=622), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=617), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619), )] (%1146:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=622)], %1141:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=617)], %1144:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619)]) -> (%1147:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=619), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=623), )] (%1147:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=619)]) -> (%1148:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=623)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=623), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=624), )] (%1148:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=623)], %1138:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=49)]) -> (%1149:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=624)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=624), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=624), )] (%1149:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=624)]) -> (%1150:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=624)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=624), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=624), )] (%1150:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=624)]) -> (%1150:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=624)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=624), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=626), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=625))] (%1150:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=624)]) -> (%1151:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=626)]) - cf.ReturnOp (%1151:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=626)], %1132:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=614)], %1134:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=616)]) -> () - } - } - graph.SubGraphOp @model.layers.18.mlp [using_qnn:true, 
symbol:model.layers.18.mlp] { - (%1153:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627)]) -> (%1158:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=629), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=628))] (%1153:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627)]) -> (%1154:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=629)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=629), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=630), )] (%1154:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=629)]) -> (%1155:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=630)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=632), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=631))] (%1153:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=627)]) -> (%1156:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=632)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=630), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=632), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=630), )] (%1155:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=630)], %1156:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=632)]) -> (%1157:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=630)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=630), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634), 
weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=633))] (%1157:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=630)]) -> (%1158:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)]) - cf.ReturnOp (%1158:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)]) -> () - } - } - graph.SubGraphOp @model.layers.19 [using_qnn:true, symbol:model.layers.19] { - (%1159:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %358:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %359:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) -> (%1200:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=664)], %1173:tensor<[1, 8, 128, 
32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=644)], %1175:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=646)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=635), )] (%1159:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)]) -> (%1160:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=635)]) - graph.CallGraphOp @model.layers.19.self_attn (%1160:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=635)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %358:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %359:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=50)]) -> (%1192:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=656)], %1173:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=644)], %1175:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=646)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=656), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=656), )] (%1192:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=656)], %1159:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=634)]) -> (%1193:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=656)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=656), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=657), )] (%1193:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=656)]) -> 
(%1194:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=657)]) - graph.CallGraphOp @model.layers.19.mlp (%1194:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=657)]) -> (%1199:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=664)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=664), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=656), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=664), )] (%1199:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=664)], %1193:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=656)]) -> (%1200:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=664)]) - cf.ReturnOp (%1200:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=664)], %1173:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=644)], %1175:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=646)]) -> () - } - } - graph.SubGraphOp @model.layers.19.self_attn [using_qnn:true, symbol:model.layers.19.self_attn] { - (%1160:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=635)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %358:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %359:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) -> (%1192:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=656)], %1173:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=644)], %1175:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=646)]) { - linalg.CPU.LinearOp (%1160:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=635)]) -> (%1161:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=640)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=635), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=637), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=636))] (%1160:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=635)]) -> (%1162:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=637)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=635), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=638))] (%1160:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=635)]) -> (%1163:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=640), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=640), )] (%1161:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: 
Int16PerTensor), uuid=640)]) -> (%1161:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=640)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=640), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=640), )] (%1161:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=640)]) -> (%1164:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=640)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=637), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=637), )] (%1162:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=637)]) -> (%1162:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=637)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=637), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=637), )] (%1162:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=637)]) -> (%1165:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=637)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639), )] (%1163:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639)]) -> (%1163:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639), )] (%1163:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639)]) -> (%1166:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=640), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=641), )] (%1164:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=640)]) -> (%1167:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=641)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=637), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=642), )] (%1165:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=637)]) -> (%1168:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=642)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=641), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=641), )] (%1167:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=641)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1169:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=641)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=642), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=642), )] (%1168:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=642)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1170:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=642)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=642), outputs_0:QuantSpec(Raw(type: Float16), uuid=643), )] (%1170:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=642)]) -> (%1171:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=643)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=643), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=644), )] (%1171:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=643)]) -> (%1172:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=644)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=644), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=644), )] (%1172:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=644)]) -> (%1173:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=644)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639), outputs_0:QuantSpec(Raw(type: Float16), uuid=645), )] (%1166:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=639)]) -> (%1174:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=645)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=645), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=646), )] (%1174:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=645)]) -> (%1175:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=646)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=644), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22), )] (%358:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %1173:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=644)]) -> (%1176:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=646), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50), )] (%359:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)], %1175:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=646)]) -> (%1177:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22), )] (%1176:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 
127, quant_to_type: Int8, scale_type: Float32), uuid=22)]) -> (%1178:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50), )] (%1177:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) -> (%1179:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=641), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647), )] (%1169:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=641)], %1178:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)]) -> (%1180:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=647), inputs_1:QuantSpec(Raw(type: Float32), uuid=648), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647), )] (%1180:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647)], %1181:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=648), constant:[0.088388346]]) -> (%1182:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649), )] (%1182:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647)]) -> (%1183:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649), inputs_1:QuantSpec(Raw(type: Int16), uuid=650), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649), )] (%1183:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649)], %1184:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=650), constant:[-20]]) -> 
(%1185:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=651), outputs_0:QuantSpec(Raw(type: UInt8), uuid=652), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %1186:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=651), constant:[0.55078125]]) -> (%1187:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=652)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=652), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649), )] (%1187:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=652)], %1182:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=647)], %1185:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649)]) -> (%1188:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=653), )] (%1188:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=649)]) -> (%1189:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=653)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=653), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=654), )] (%1189:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=653)], %1179:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=50)]) -> (%1190:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=654)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=654), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=654), )] (%1190:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=654)]) -> (%1191:tensor<[1, 32, 16, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=654)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=654), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=654), )] (%1191:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=654)]) -> (%1191:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=654)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=654), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=656), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=655))] (%1191:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=654)]) -> (%1192:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=656)]) - cf.ReturnOp (%1192:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=656)], %1173:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=644)], %1175:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=646)]) -> () - } - } - graph.SubGraphOp @model.layers.19.mlp [using_qnn:true, symbol:model.layers.19.mlp] { - (%1194:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=657)]) -> (%1199:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=664)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=657), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=659), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=658))] (%1194:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=657)]) -> (%1195:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=659)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=659), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=660), )] (%1195:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=659)]) -> (%1196:tensor<[1, 
32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=660)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=657), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=662), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=661))] (%1194:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=657)]) -> (%1197:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=662)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=660), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=662), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=660), )] (%1196:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=660)], %1197:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=662)]) -> (%1198:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=660)]) - linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=660), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=664), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=663))] (%1198:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=660)]) -> (%1199:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=664)]) - cf.ReturnOp (%1199:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=664)]) -> () - } - } - graph.SubGraphOp @model.layers.20 [using_qnn:true, symbol:model.layers.20] { - (%1200:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=664)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %360:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %361:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) -> (%1241:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694)], %1214:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=674)], %1216:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=676)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=664), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665), )] (%1200:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=664)]) -> (%1201:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665)]) - graph.CallGraphOp @model.layers.20.self_attn (%1201:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %360:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %361:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) -> (%1233:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=686)], %1214:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=674)], %1216:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=676)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=686), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=664), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=686), )] (%1233:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=686)], %1200:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=664)]) -> (%1234:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=686)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=686), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=687), )] (%1234:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=686)]) -> (%1235:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=687)]) - graph.CallGraphOp @model.layers.20.mlp (%1235:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=687)]) -> (%1240:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=686), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694), )] (%1240:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694)], %1234:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=686)]) -> (%1241:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694)]) - cf.ReturnOp (%1241:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=694)], %1214:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=674)], %1216:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=676)]) -> () - } - } - graph.SubGraphOp @model.layers.20.self_attn [using_qnn:true, symbol:model.layers.20.self_attn] { - (%1201:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %360:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %361:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) -> (%1233:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=686)], %1214:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=674)], %1216:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=676)]) { - linalg.CPU.LinearOp (%1201:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665)]) -> (%1202:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=670)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=667), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=666))] (%1201:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665)]) -> (%1203:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=667)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=668))] (%1201:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=665)]) -> (%1204:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=669)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=670), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=670), )] (%1202:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=670)]) -> (%1202:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=670)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=670), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=670), )] (%1202:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=670)]) -> (%1205:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=670)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=667), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=667), )] (%1203:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=667)]) -> (%1203:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=667)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=667), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=667), )] (%1203:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=667)]) -> (%1206:tensor<[1, 8, 32, 
128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=667)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669), )] (%1204:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669)]) -> (%1204:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669), )] (%1204:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669)]) -> (%1207:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=670), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=671), )] (%1205:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=670)]) -> (%1208:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=671)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=667), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=672), )] (%1206:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=667)]) -> (%1209:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=672)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=671), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=671), )] (%1208:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=671)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1210:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=671)]) - linalg.CPU.RoPEOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=672), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=672), )] (%1209:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=672)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1211:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=672)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=672), outputs_0:QuantSpec(Raw(type: Float16), uuid=673), )] (%1211:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=672)]) -> (%1212:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=673)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=673), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=674), )] 
(%1212:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=673)]) -> (%1213:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=674)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=674), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=674), )] (%1213:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=674)]) -> (%1214:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=674)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669), outputs_0:QuantSpec(Raw(type: Float16), uuid=675), )] (%1207:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=669)]) -> (%1215:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=675)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=675), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=676), )] (%1215:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=675)]) -> (%1216:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=676)]) - linalg.CPU.ConcatOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=674), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23), )] (%360:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %1214:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=674)]) -> (%1217:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=676), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51), )] (%361:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)], %1216:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=676)]) -> (%1218:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=23), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23), )] (%1217:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)]) -> (%1219:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51), )] (%1218:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) -> (%1220:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=671), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677), )] (%1210:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=671)], %1219:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)]) -> (%1221:tensor<[1, 16, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677), inputs_1:QuantSpec(Raw(type: Float32), uuid=678), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677), )] (%1221:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)], %1222:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=678), constant:[0.088388346]]) -> (%1223:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=679), )] (%1223:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)]) -> (%1224:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=679)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=679), inputs_1:QuantSpec(Raw(type: Int16), uuid=680), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=679), )] 
(%1224:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=679)], %1225:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=680), constant:[-20]]) -> (%1226:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=679)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=681), outputs_0:QuantSpec(Raw(type: UInt8), uuid=682), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %1227:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=681), constant:[0.71875]]) -> (%1228:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=682)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=682), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=679), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=679), )] (%1228:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=682)], %1223:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=677)], %1226:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=679)]) -> (%1229:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=679)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=679), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683), )] (%1229:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=679)]) -> (%1230:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=684), )] (%1230:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=683)], %1220:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=51)]) -> (%1231:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=684)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=684), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=684), )] (%1231:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=684)]) -> (%1232:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=684)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=684), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=684), )] (%1232:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=684)]) -> (%1232:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=684)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=684), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=686), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=685))] (%1232:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=684)]) -> (%1233:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=686)]) - cf.ReturnOp (%1233:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=686)], %1214:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=674)], %1216:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=676)]) -> () - } - } - graph.SubGraphOp @model.layers.20.mlp [using_qnn:true, symbol:model.layers.20.mlp] { - (%1235:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=687)]) -> (%1240:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=687), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=689), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=688))] (%1235:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=687)]) -> (%1236:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=689)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=689), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=690), )] (%1236:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=689)]) -> (%1237:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=690)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=687), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=692), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=691))] (%1235:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=687)]) -> (%1238:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=692)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=690), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=692), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=690), )] (%1237:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=690)], %1238:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=692)]) -> (%1239:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=690)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=690), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=693))] (%1239:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=690)]) -> (%1240:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694)]) - cf.ReturnOp (%1240:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694)]) -> () - } - } - graph.SubGraphOp @model.layers.21 [using_qnn:true, symbol:model.layers.21] { - (%1241:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], 
%362:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %363:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) -> (%1282:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=724)], %1255:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=704)], %1257:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=706)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695), )] (%1241:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694)]) -> (%1242:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695)]) - graph.CallGraphOp @model.layers.21.self_attn (%1242:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %362:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %363:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) -> (%1274:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=716)], %1255:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=704)], %1257:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=706)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=716), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=716), )] (%1274:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=716)], %1241:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=694)]) -> (%1275:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=716)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=716), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717), )] (%1275:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=716)]) -> (%1276:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717)]) - graph.CallGraphOp @model.layers.21.mlp (%1276:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717)]) -> (%1281:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=724)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=724), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=716), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=724), )] (%1281:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=724)], %1275:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=716)]) -> (%1282:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=724)]) - cf.ReturnOp (%1282:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=724)], %1255:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=704)], %1257:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=706)]) -> () - } - } - graph.SubGraphOp @model.layers.21.self_attn [using_qnn:true, symbol:model.layers.21.self_attn] { - (%1242:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %362:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %363:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) -> (%1274:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=716)], %1255:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=704)], %1257:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=706)]) { - linalg.CPU.LinearOp (%1242:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695)]) -> (%1243:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=700)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=697), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=696))] (%1242:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695)]) -> (%1244:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=697)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=698))] (%1242:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=695)]) -> (%1245:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=700), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=700), )] (%1243:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=700)]) -> (%1243:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=700)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=700), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=700), )] (%1243:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=700)]) -> (%1246:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=700)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=697), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=697), )] (%1244:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=697)]) -> (%1244:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=697)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=697), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=697), )] (%1244:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=697)]) -> (%1247:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=697)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699), )] (%1245:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699)]) -> (%1245:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699), )] (%1245:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699)]) -> (%1248:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=700), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=701), )] (%1246:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=700)]) -> (%1249:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=701)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=697), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702), )] (%1247:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=697)]) -> (%1250:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=701), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=701), )] (%1249:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=701)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1251:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=701)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702), )] (%1250:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1252:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702), outputs_0:QuantSpec(Raw(type: Float16), uuid=703), )] (%1252:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=702)]) -> 
(%1253:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=703)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=703), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=704), )] (%1253:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=703)]) -> (%1254:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=704)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=704), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=704), )] (%1254:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=704)]) -> (%1255:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=704)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699), outputs_0:QuantSpec(Raw(type: Float16), uuid=705), )] (%1248:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=699)]) -> (%1256:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=705)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=705), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=706), )] 
(%1256:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=705)]) -> (%1257:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=706)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=704), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24), )] (%362:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %1255:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=704)]) -> (%1258:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=706), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52), )] (%363:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)], %1257:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=706)]) -> (%1259:tensor<[1, 8, 
1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24), )] (%1258:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)]) -> (%1260:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52), )] (%1259:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) -> (%1261:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=701), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=707), )] (%1251:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=701)], %1260:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)]) -> (%1262:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=707)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=707), inputs_1:QuantSpec(Raw(type: Float32), uuid=708), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=707), )] (%1262:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=707)], %1263:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=708), constant:[0.088388346]]) -> (%1264:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=707)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=707), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=709), )] (%1264:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=707)]) -> (%1265:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=709)]) - linalg.CPU.AddOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=709), inputs_1:QuantSpec(Raw(type: Int16), uuid=710), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=709), )] (%1265:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=709)], %1266:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=710), constant:[-20]]) -> (%1267:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=709)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=711), outputs_0:QuantSpec(Raw(type: UInt8), uuid=712), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %1268:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=711), constant:[-0.80859375]]) -> (%1269:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=712)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=712), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=707), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=709), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=709), )] (%1269:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=712)], %1264:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=707)], %1267:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=709)]) -> (%1270:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=709)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=709), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=713), )] (%1270:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=709)]) -> (%1271:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=713)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=713), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=714), )] (%1271:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=713)], %1261:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=52)]) -> (%1272:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=714)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=714), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=714), )] (%1272:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=714)]) -> (%1273:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=714)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=714), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=714), )] (%1273:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=714)]) -> (%1273:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=714)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=714), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=716), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=715))] (%1273:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=714)]) -> (%1274:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=716)]) - cf.ReturnOp (%1274:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=716)], %1255:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=704)], %1257:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=706)]) -> () - } - } - graph.SubGraphOp @model.layers.21.mlp [using_qnn:true, symbol:model.layers.21.mlp] { - (%1276:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717)]) -> (%1281:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=724)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=719), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=718))] (%1276:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717)]) -> (%1277:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=719)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=719), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=720), )] (%1277:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=719)]) -> (%1278:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=720)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=722), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=721))] (%1276:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=717)]) -> (%1279:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=722)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=720), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=722), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=720), )] (%1278:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=720)], %1279:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=722)]) -> (%1280:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=720)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=720), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=724), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=723))] (%1280:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=720)]) -> (%1281:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=724)]) - cf.ReturnOp (%1281:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=724)]) -> () - } - } - graph.SubGraphOp @model.layers.22 [using_qnn:true, symbol:model.layers.22] { - (%1282:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=724)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %364:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %365:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) -> (%1323:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=754)], %1296:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=734)], %1298:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=736)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=724), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=725), )] (%1282:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=724)]) -> (%1283:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=725)]) - graph.CallGraphOp @model.layers.22.self_attn (%1283:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=725)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %364:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %365:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) -> (%1315:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746)], %1296:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=734)], %1298:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=736)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=724), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746), )] (%1315:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746)], 
%1282:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=724)]) -> (%1316:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=747), )] (%1316:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746)]) -> (%1317:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=747)]) - graph.CallGraphOp @model.layers.22.mlp (%1317:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=747)]) -> (%1322:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=754)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=754), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=754), )] (%1322:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=754)], %1316:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746)]) -> (%1323:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=754)]) - cf.ReturnOp (%1323:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=754)], %1296:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=734)], %1298:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=736)]) -> () - } - } - graph.SubGraphOp @model.layers.22.self_attn [using_qnn:true, symbol:model.layers.22.self_attn] { - (%1283:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=725)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %364:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %365:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=53)]) -> (%1315:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746)], %1296:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=734)], %1298:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=736)]) { - linalg.CPU.LinearOp (%1283:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=725)]) -> (%1284:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=730)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=725), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=726))] (%1283:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=725)]) -> (%1285:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=725), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=729), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=728))] (%1283:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=725)]) -> (%1286:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=730), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=730), )] (%1284:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=730)]) -> (%1284:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=730)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=730), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=730), )] (%1284:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=730)]) -> (%1287:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=730)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727), )] (%1285:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727)]) -> (%1285:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=727)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727), )] (%1285:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727)]) -> (%1288:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729), )] (%1286:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729)]) -> (%1286:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729), )] (%1286:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729)]) -> (%1289:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=729)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=730), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=731), )] (%1287:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=730)]) -> (%1290:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=731)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=732), )] (%1288:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=727)]) -> (%1291:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=732)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=731), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=731), )] (%1290:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=731)], %379:tensor<[1, 32, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1292:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=731)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=732), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=732), )] (%1291:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=732)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1293:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=732)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=732), outputs_0:QuantSpec(Raw(type: Float16), 
uuid=733), )] (%1293:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=732)]) -> (%1294:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=733)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=733), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=734), )] (%1294:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=733)]) -> (%1295:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=734)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=734), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=734), )] (%1295:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=734)]) -> (%1296:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=734)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729), outputs_0:QuantSpec(Raw(type: Float16), uuid=735), )] (%1289:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=729)]) -> (%1297:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=735)]) - linalg.CPU.CastTypeOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=735), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=736), )] (%1297:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=735)]) -> (%1298:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=736)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=734), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25), )] (%364:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %1296:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=734)]) -> (%1299:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=736), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53), )] (%365:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=53)], %1298:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=736)]) -> (%1300:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25), )] (%1299:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)]) -> (%1301:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53), )] (%1300:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) -> (%1302:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=731), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=737), )] (%1292:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=731)], %1301:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)]) -> (%1303:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=737)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=737), inputs_1:QuantSpec(Raw(type: Float32), uuid=738), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=737), )] (%1303:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=737)], %1304:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=738), constant:[0.088388346]]) -> (%1305:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=737)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=737), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=739), )] (%1305:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=737)]) -> (%1306:tensor<[1, 16, 32, 1], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=739)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=739), inputs_1:QuantSpec(Raw(type: Int16), uuid=740), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=739), )] (%1306:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=739)], %1307:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=740), constant:[-20]]) -> (%1308:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=739)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=741), outputs_0:QuantSpec(Raw(type: UInt8), uuid=742), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %1309:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=741), constant:[-0.42773438]]) -> (%1310:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=742)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=742), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=737), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=739), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=739), )] (%1310:tensor<[1, 1, 32, 1024], UInt8, 
CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=742)], %1305:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=737)], %1308:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=739)]) -> (%1311:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=739)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=739), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=743), )] (%1311:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=739)]) -> (%1312:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=743)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=743), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744), )] (%1312:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=743)], %1302:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=53)]) -> (%1313:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744), )] (%1313:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744)]) -> (%1314:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744), )] (%1314:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744)]) -> (%1314:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, 
scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=745))] (%1314:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=744)]) -> (%1315:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746)]) - cf.ReturnOp (%1315:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=746)], %1296:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=734)], %1298:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=736)]) -> () - } - } - graph.SubGraphOp @model.layers.22.mlp [using_qnn:true, symbol:model.layers.22.mlp] { - (%1317:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=747)]) -> (%1322:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=754)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=747), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=748))] (%1317:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=747)]) -> (%1318:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=750), )] (%1318:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=749)]) -> (%1319:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=750)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=747), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=752), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=751))] (%1317:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=747)]) -> (%1320:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=752)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=750), 
inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=752), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=750), )] (%1319:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=750)], %1320:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=752)]) -> (%1321:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=750)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=750), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=754), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=753))] (%1321:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=750)]) -> (%1322:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=754)]) - cf.ReturnOp (%1322:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=754)]) -> () - } - } - graph.SubGraphOp @model.layers.23 [using_qnn:true, symbol:model.layers.23] { - (%1323:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=754)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %366:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %367:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) -> (%1364:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784)], %1337:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=764)], %1339:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=766)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=754), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=755), )] (%1323:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=754)]) -> (%1324:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=755)]) - graph.CallGraphOp @model.layers.23.self_attn (%1324:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=755)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %366:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %367:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) -> (%1356:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=776)], %1337:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=764)], %1339:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=766)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=776), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=754), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=776), )] (%1356:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=776)], %1323:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=754)]) -> (%1357:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=776)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=776), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=777), )] (%1357:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=776)]) -> (%1358:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=777)]) - graph.CallGraphOp @model.layers.23.mlp (%1358:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=777)]) -> (%1363:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=776), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784), )] (%1363:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784)], %1357:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=776)]) -> (%1364:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784)]) - cf.ReturnOp (%1364:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784)], %1337:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=764)], %1339:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=766)]) -> () - } - } - graph.SubGraphOp @model.layers.23.self_attn [using_qnn:true, symbol:model.layers.23.self_attn] { - (%1324:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=755)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %366:tensor<[1, 8, 
128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %367:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) -> (%1356:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=776)], %1337:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=764)], %1339:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=766)]) { - linalg.CPU.LinearOp (%1324:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=755)]) -> (%1325:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=760)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=755), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=757), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=756))] (%1324:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=755)]) -> (%1326:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=757)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=755), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=759), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=758))] (%1324:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=755)]) -> (%1327:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=759)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=760), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=760), )] (%1325:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=760)]) -> (%1325:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=760)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=760), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=760), )] (%1325:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=760)]) -> (%1328:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=760)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=757), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=757), )] (%1326:tensor<[1, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=757)]) -> (%1326:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=757)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=757), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=757), )] (%1326:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=757)]) -> (%1329:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=757)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=759), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=759), )] (%1327:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=759)]) -> (%1327:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=759)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=759), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=759), )] (%1327:tensor<[1, 32, 8, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=759)]) -> (%1330:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=759)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=760), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761), )] (%1328:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=760)]) -> (%1331:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=757), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=762), )] (%1329:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=757)]) -> (%1332:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=762)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761), )] (%1331:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1333:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=762), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=762), )] (%1332:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=762)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1334:tensor<[1, 8, 32, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=762)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=762), outputs_0:QuantSpec(Raw(type: Float16), uuid=763), )] (%1334:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=762)]) -> (%1335:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=763)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=763), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=764), )] (%1335:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=763)]) -> (%1336:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=764)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=764), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=764), )] (%1336:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=764)]) -> (%1337:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=764)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=759), 
outputs_0:QuantSpec(Raw(type: Float16), uuid=765), )] (%1330:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=759)]) -> (%1338:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=765)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=765), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=766), )] (%1338:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=765)]) -> (%1339:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=766)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=764), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26), )] (%366:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %1337:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=764)]) -> (%1340:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=766), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54), )] (%367:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)], %1339:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=766)]) -> (%1341:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26), )] (%1340:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)]) -> (%1342:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54), )] (%1341:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) -> (%1343:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=54)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767), )] (%1333:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=761)], %1342:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)]) -> (%1344:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767), inputs_1:QuantSpec(Raw(type: Float32), uuid=768), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767), )] (%1344:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767)], %1345:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=768), constant:[0.088388346]]) -> (%1346:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=769), )] (%1346:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767)]) -> (%1347:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=769)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=769), inputs_1:QuantSpec(Raw(type: Int16), uuid=770), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=769), )] (%1347:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=769)], %1348:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=770), constant:[-20]]) -> (%1349:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=769)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=771), outputs_0:QuantSpec(Raw(type: UInt8), uuid=772), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %1350:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=771), constant:[0.96484375]]) -> (%1351:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=772)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=772), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=767), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=769), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=769), )] (%1351:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=772)], %1346:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=767)], %1349:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=769)]) -> (%1352:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=769)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=769), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=773), )] (%1352:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=769)]) -> (%1353:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=773)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=773), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=774), )] (%1353:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=773)], %1343:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=54)]) -> (%1354:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774), )] (%1354:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774)]) -> (%1355:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774), )] (%1355:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774)]) -> (%1355:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774)]) - linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=776), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=775))] (%1355:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=774)]) -> (%1356:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=776)]) - cf.ReturnOp (%1356:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=776)], %1337:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=764)], %1339:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=766)]) -> () - } - } - graph.SubGraphOp @model.layers.23.mlp [using_qnn:true, symbol:model.layers.23.mlp] { - (%1358:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=777)]) -> (%1363:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=777), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=778))] (%1358:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=777)]) -> (%1359:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780), )] (%1359:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=779)]) -> (%1360:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=777), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=782), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=781))] (%1358:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=777)]) -> (%1361:tensor<[1, 32, 
6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=782)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=782), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780), )] (%1360:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780)], %1361:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=782)]) -> (%1362:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=783))] (%1362:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=780)]) -> (%1363:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784)]) - cf.ReturnOp (%1363:tensor<[1, 32, 2048], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784)]) -> () - } - } - graph.SubGraphOp @model.layers.24 [using_qnn:true, symbol:model.layers.24] { - (%1364:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %368:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %369:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) -> (%1405:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814)], %1378:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)], %1380:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=796)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=785), )] (%1364:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784)]) -> (%1365:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785)]) - graph.CallGraphOp @model.layers.24.self_attn (%1365:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %368:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %369:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) -> (%1397:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=806)], %1378:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)], %1380:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=796)]) - linalg.CPU.AddOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=806), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=806), )] (%1397:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=806)], %1364:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=784)]) -> (%1398:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=806)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=806), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=807), )] (%1398:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=806)]) -> (%1399:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=807)]) - graph.CallGraphOp @model.layers.24.mlp (%1399:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=807)]) -> (%1404:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=814)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=806), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814), )] (%1404:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814)], %1398:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=806)]) -> (%1405:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814)]) - cf.ReturnOp (%1405:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814)], %1378:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)], %1380:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=796)]) -> () - } - } - graph.SubGraphOp @model.layers.24.self_attn [using_qnn:true, symbol:model.layers.24.self_attn] { - (%1365:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], 
%380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %368:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %369:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) -> (%1397:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=806)], %1378:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)], %1380:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=796)]) { - linalg.CPU.LinearOp (%1365:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785)]) -> (%1366:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=790)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=787), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=786))] (%1365:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785)]) -> (%1367:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=787)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=788))] (%1365:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=785)]) -> (%1368:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=790), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=790), )] (%1366:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=790)]) -> (%1366:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=790)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=790), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=790), )] (%1366:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=790)]) -> (%1369:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=790)]) - linalg.CPU.ViewOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=787), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=787), )] (%1367:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=787)]) -> (%1367:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=787)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=787), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=787), )] (%1367:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=787)]) -> (%1370:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=787)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789), )] (%1368:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789)]) -> (%1368:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789)]) - 
linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789), )] (%1368:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789)]) -> (%1371:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=790), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=791), )] (%1369:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=790)]) -> (%1372:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=791)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=787), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=792), )] (%1370:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=787)]) -> (%1373:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=792)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=791), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=791), )] (%1372:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=791)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1374:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=791)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=792), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=792), )] (%1373:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=792)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1375:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=792)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=792), outputs_0:QuantSpec(Raw(type: Float16), uuid=793), )] (%1375:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=792)]) -> (%1376:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=793)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=793), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794), )] (%1376:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=793)]) -> (%1377:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794), )] (%1377:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)]) -> (%1378:tensor<[1, 8, 128, 32], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789), outputs_0:QuantSpec(Raw(type: Float16), uuid=795), )] (%1371:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=789)]) -> (%1379:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=795)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=795), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=796), )] (%1379:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=795)]) -> (%1380:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=796)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27), )] (%368:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %1378:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)]) -> (%1381:tensor<[1, 8, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=796), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55), )] (%369:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)], %1380:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=796)]) -> (%1382:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27), )] (%1381:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)]) -> (%1383:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=55), )] (%1382:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) -> (%1384:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=791), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797), )] (%1374:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=791)], %1383:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)]) -> (%1385:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797), inputs_1:QuantSpec(Raw(type: Float32), uuid=798), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797), )] (%1385:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797)], %1386:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=798), constant:[0.088388346]]) -> 
(%1387:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=799), )] (%1387:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797)]) -> (%1388:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=799)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=799), inputs_1:QuantSpec(Raw(type: Int16), uuid=800), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=799), )] (%1388:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=799)], %1389:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=800), constant:[-20]]) -> (%1390:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=799)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=801), outputs_0:QuantSpec(Raw(type: UInt8), uuid=802), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %1391:tensor<[1], UInt16, 
CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=801), constant:[0.07910156]]) -> (%1392:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=802)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=802), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=799), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=799), )] (%1392:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=802)], %1387:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=797)], %1390:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=799)]) -> (%1393:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=799)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=799), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=803), )] (%1393:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=799)]) -> (%1394:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=803)]) - linalg.CPU.MatMulOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=803), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804), )] (%1394:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=803)], %1384:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=55)]) -> (%1395:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804), )] (%1395:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804)]) -> (%1396:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804), )] (%1396:tensor<[1, 32, 16, 128], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804)]) -> (%1396:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=806), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=805))] (%1396:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=804)]) -> (%1397:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=806)]) - cf.ReturnOp (%1397:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=806)], %1378:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=794)], %1380:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=796)]) -> () - } - } - graph.SubGraphOp @model.layers.24.mlp [using_qnn:true, symbol:model.layers.24.mlp] { - (%1399:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=807)]) -> (%1404:tensor<[1, 32, 
2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=807), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=809), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=808))] (%1399:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=807)]) -> (%1400:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=809)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=809), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=810), )] (%1400:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=809)]) -> (%1401:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=810)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=807), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=812), weight_weight:QuantSpec(LPBQ(quant_min: 
-8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=811))] (%1399:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=807)]) -> (%1402:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=812)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=810), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=812), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=810), )] (%1401:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=810)], %1402:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=812)]) -> (%1403:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=810)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=810), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=813))] (%1403:tensor<[1, 32, 6144], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=810)]) -> (%1404:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814)]) - cf.ReturnOp (%1404:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814)]) -> () - } - } - graph.SubGraphOp @model.layers.25 [using_qnn:true, symbol:model.layers.25] { - (%1405:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %370:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %371:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) -> (%1446:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=844)], %1419:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=824)], %1421:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=815), )] (%1405:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814)]) -> (%1406:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=815)]) - graph.CallGraphOp @model.layers.25.self_attn (%1406:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=815)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %370:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %371:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) -> (%1438:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836)], 
%1419:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=824)], %1421:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836), )] (%1438:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836)], %1405:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=814)]) -> (%1439:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=837), )] (%1439:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836)]) -> (%1440:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=837)]) - graph.CallGraphOp 
@model.layers.25.mlp (%1440:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=837)]) -> (%1445:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=844)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=844), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=844), )] (%1445:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=844)], %1439:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836)]) -> (%1446:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=844)]) - cf.ReturnOp (%1446:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=844)], %1419:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=824)], %1421:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)]) -> () - } - } - graph.SubGraphOp @model.layers.25.self_attn [using_qnn:true, symbol:model.layers.25.self_attn] { 
- (%1406:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=815)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %370:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %371:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) -> (%1438:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836)], %1419:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=824)], %1421:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)]) { - linalg.CPU.LinearOp (%1406:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=815)]) -> (%1407:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=820)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=815), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=816))] (%1406:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=815)]) -> (%1408:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=815), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=818))] (%1406:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=815)]) -> (%1409:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=820), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=820), )] (%1407:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=820)]) -> (%1407:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=820)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), 
uuid=820), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=820), )] (%1407:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=820)]) -> (%1410:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=820)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817), )] (%1408:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817)]) -> (%1408:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817), )] (%1408:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817)]) -> (%1411:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819), )] (%1409:tensor<[1, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819)]) -> (%1409:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819), )] (%1409:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819)]) -> (%1412:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=820), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=821), )] (%1410:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=820)]) -> (%1413:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=821)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=817), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=822), )] (%1411:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=817)]) -> (%1414:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=822)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=821), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=821), )] (%1413:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=821)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1415:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=821)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=822), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=822), )] (%1414:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=822)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1416:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=822)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=822), outputs_0:QuantSpec(Raw(type: Float16), uuid=823), )] (%1416:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=822)]) -> (%1417:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=823)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=823), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=824), )] (%1417:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=823)]) -> (%1418:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=824)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=824), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=824), )] (%1418:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=824)]) -> (%1419:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=824)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819), outputs_0:QuantSpec(Raw(type: Float16), uuid=825), )] (%1412:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=819)]) -> (%1420:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=825)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=825), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826), )] (%1420:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=825)]) -> (%1421:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=824), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28), )] (%370:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=28)], %1419:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=824)]) -> (%1422:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56), )] (%371:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)], %1421:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)]) -> (%1423:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28), )] (%1422:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)]) -> (%1424:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)]) - linalg.CPU.RepeatOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56), )] (%1423:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) -> (%1425:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=821), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=827), )] (%1415:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=821)], %1424:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)]) -> (%1426:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=827)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=827), inputs_1:QuantSpec(Raw(type: Float32), uuid=828), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=827), )] (%1426:tensor<[1, 16, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=827)], %1427:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=828), constant:[0.088388346]]) -> (%1428:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=827)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=827), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829), )] (%1428:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=827)]) -> (%1429:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829), inputs_1:QuantSpec(Raw(type: Int16), uuid=830), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829), )] (%1429:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829)], %1430:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=830), constant:[-20]]) -> (%1431:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: 
UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=831), outputs_0:QuantSpec(Raw(type: UInt8), uuid=832), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %1432:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=831), constant:[-0.9921875]]) -> (%1433:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=832)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=832), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=827), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829), )] (%1433:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=832)], %1428:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=827)], %1431:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829)]) -> (%1434:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=829), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=833), )] (%1434:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=829)]) -> (%1435:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=833)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=833), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=834), )] (%1435:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=833)], %1425:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=56)]) -> (%1436:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=834)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=834), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=834), )] (%1436:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=834)]) -> (%1437:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=834)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: 
Int16, scale_type: Float32), uuid=834), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=834), )] (%1437:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=834)]) -> (%1437:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=834)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=834), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=835))] (%1437:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=834)]) -> (%1438:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836)]) - cf.ReturnOp (%1438:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=836)], %1419:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=824)], %1421:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=826)]) -> () - } - } - graph.SubGraphOp @model.layers.25.mlp [using_qnn:true, 
symbol:model.layers.25.mlp] { - (%1440:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=837)]) -> (%1445:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=844)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=837), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=839), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=838))] (%1440:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=837)]) -> (%1441:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=839)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=839), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=840), )] (%1441:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=839)]) -> (%1442:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=840)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=837), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=842), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=841))] (%1440:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=837)]) -> (%1443:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=842)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=840), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=842), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=840), )] (%1442:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=840)], %1443:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=842)]) -> (%1444:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=840)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=840), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=844), 
weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=843))] (%1444:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=840)]) -> (%1445:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=844)]) - cf.ReturnOp (%1445:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=844)]) -> () - } - } - graph.SubGraphOp @model.layers.26 [using_qnn:true, symbol:model.layers.26] { - (%1446:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=844)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %372:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %373:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) -> (%1487:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=874)], %1460:tensor<[1, 8, 128, 
32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=854)], %1462:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=856)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=844), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=845), )] (%1446:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=844)]) -> (%1447:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=845)]) - graph.CallGraphOp @model.layers.26.self_attn (%1447:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=845)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %372:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %373:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=57)]) -> (%1479:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=866)], %1460:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=854)], %1462:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=856)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=866), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=844), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=866), )] (%1479:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=866)], %1446:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=844)]) -> (%1480:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=866)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=866), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=867), )] (%1480:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=866)]) -> 
(%1481:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=867)]) - graph.CallGraphOp @model.layers.26.mlp (%1481:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=867)]) -> (%1486:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=874)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=874), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=866), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=874), )] (%1486:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=874)], %1480:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=866)]) -> (%1487:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=874)]) - cf.ReturnOp (%1487:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=874)], %1460:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=854)], %1462:tensor<[1, 8, 32, 128], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=856)]) -> () - } - } - graph.SubGraphOp @model.layers.26.self_attn [using_qnn:true, symbol:model.layers.26.self_attn] { - (%1447:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=845)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %372:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %373:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) -> (%1479:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=866)], %1460:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=854)], %1462:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=856)]) { - linalg.CPU.LinearOp (%1447:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=845)]) -> (%1448:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=850)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=845), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=846))] (%1447:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=845)]) -> (%1449:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=845), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=849), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=848))] (%1447:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=845)]) -> (%1450:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=849)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=850), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=850), )] (%1448:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: 
Int16PerTensor), uuid=850)]) -> (%1448:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=850)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=850), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=850), )] (%1448:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=850)]) -> (%1451:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=850)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847), )] (%1449:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)]) -> (%1449:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847), )] (%1449:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)]) -> (%1452:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, 
quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=849), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=849), )] (%1450:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=849)]) -> (%1450:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=849)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=849), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=849), )] (%1450:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=849)]) -> (%1453:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=849)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=850), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851), )] (%1451:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=850)]) -> (%1454:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=852), )] (%1452:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=847)]) -> (%1455:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=852)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851), )] (%1454:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1456:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=852), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: 
Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=852), )] (%1455:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=852)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1457:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=852)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=852), outputs_0:QuantSpec(Raw(type: Float16), uuid=853), )] (%1457:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=852)]) -> (%1458:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=853)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=853), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=854), )] (%1458:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=853)]) -> (%1459:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=854)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=854), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=854), )] (%1459:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=854)]) -> (%1460:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=854)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=849), outputs_0:QuantSpec(Raw(type: Float16), uuid=855), )] (%1453:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=849)]) -> (%1461:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=855)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=855), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=856), )] (%1461:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=855)]) -> (%1462:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=856)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=854), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29), )] (%372:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %1460:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=854)]) -> (%1463:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=856), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57), )] (%373:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)], %1462:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=856)]) -> (%1464:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29), )] (%1463:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 
127, quant_to_type: Int8, scale_type: Float32), uuid=29)]) -> (%1465:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57), )] (%1464:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) -> (%1466:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=857), )] (%1456:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=851)], %1465:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)]) -> (%1467:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=857)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=857), inputs_1:QuantSpec(Raw(type: Float32), uuid=858), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=857), )] (%1467:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=857)], %1468:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=858), constant:[0.088388346]]) -> (%1469:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=857)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=857), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=859), )] (%1469:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=857)]) -> (%1470:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=859)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=859), inputs_1:QuantSpec(Raw(type: Int16), uuid=860), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=859), )] (%1470:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=859)], %1471:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=860), constant:[-20]]) -> 
(%1472:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=859)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=861), outputs_0:QuantSpec(Raw(type: UInt8), uuid=862), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %1473:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=861), constant:[0.27929688]]) -> (%1474:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=862)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=862), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=857), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=859), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=859), )] (%1474:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=862)], %1469:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=857)], %1472:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=859)]) -> (%1475:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=859)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=859), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=863), )] (%1475:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=859)]) -> (%1476:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=863)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=863), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=864), )] (%1476:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=863)], %1466:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=57)]) -> (%1477:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=864)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=864), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=864), )] (%1477:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=864)]) -> (%1478:tensor<[1, 32, 16, 128], 
Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=864)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=864), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=864), )] (%1478:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=864)]) -> (%1478:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=864)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=864), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=866), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=865))] (%1478:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=864)]) -> (%1479:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=866)]) - cf.ReturnOp (%1479:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=866)], %1460:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=854)], %1462:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=856)]) -> () - } - } - graph.SubGraphOp @model.layers.26.mlp [using_qnn:true, symbol:model.layers.26.mlp] { - (%1481:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=867)]) -> (%1486:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=874)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=867), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=869), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=868))] (%1481:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=867)]) -> (%1482:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=869)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=869), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870), )] (%1482:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=869)]) -> (%1483:tensor<[1, 
32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=867), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=871))] (%1481:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=867)]) -> (%1484:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870), )] (%1483:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870)], %1484:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=872)]) -> (%1485:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870)]) - linalg.CPU.LinearOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=874), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=873))] (%1485:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=870)]) -> (%1486:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=874)]) - cf.ReturnOp (%1486:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=874)]) -> () - } - } - graph.SubGraphOp @model.layers.27 [using_qnn:true, symbol:model.layers.27] { - (%1487:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=874)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %374:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %375:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%1528:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904)], %1501:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=884)], %1503:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=886)]) { - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=874), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=875), )] (%1487:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=874)]) -> (%1488:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=875)]) - graph.CallGraphOp @model.layers.27.self_attn (%1488:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=875)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %374:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %375:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%1520:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=896)], %1501:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=884)], %1503:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=886)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=896), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=874), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=896), )] (%1520:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=896)], %1487:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=874)]) -> (%1521:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=896)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=896), 
outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=897), )] (%1521:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=896)]) -> (%1522:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=897)]) - graph.CallGraphOp @model.layers.27.mlp (%1522:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=897)]) -> (%1527:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=896), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904), )] (%1527:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904)], %1521:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=896)]) -> (%1528:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904)]) - cf.ReturnOp (%1528:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, 
quant_to_type: Int16, scale_type: Float32), uuid=904)], %1501:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=884)], %1503:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=886)]) -> () - } - } - graph.SubGraphOp @model.layers.27.self_attn [using_qnn:true, symbol:model.layers.27.self_attn] { - (%1488:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=875)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)], %319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %374:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %375:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%1520:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=896)], %1501:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=884)], %1503:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: 
Float32), uuid=886)]) { - linalg.CPU.LinearOp (%1488:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=875)]) -> (%1489:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=880)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=875), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=876))] (%1488:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=875)]) -> (%1490:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=875), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=879), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=878))] (%1488:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=875)]) -> (%1491:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), 
uuid=879)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=880), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=880), )] (%1489:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=880)]) -> (%1489:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=880)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=880), outputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=880), )] (%1489:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=880)]) -> (%1492:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=880)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877), )] (%1490:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877)]) -> (%1490:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877), )] (%1490:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877)]) -> (%1493:tensor<[1, 8, 32, 
128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=879), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=879), )] (%1491:tensor<[1, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=879)]) -> (%1491:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=879)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=879), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=879), )] (%1491:tensor<[1, 32, 8, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=879)]) -> (%1494:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=879)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Int16PerTensor), uuid=880), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881), )] (%1492:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(Raw(type: Int16PerTensor), uuid=880)]) -> (%1495:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=881)]) - linalg.CPU.RMSNormOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882), )] (%1493:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=877)]) -> (%1496:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882)]) - linalg.CPU.RoPEOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881), )] (%1495:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1497:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)]) - linalg.CPU.RoPEOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882), )] (%1496:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882)], %379:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=62)], %380:tensor<[1, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=64)]) -> (%1498:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882), outputs_0:QuantSpec(Raw(type: Float16), uuid=883), )] (%1498:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=882)]) -> (%1499:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=883)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=883), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=884), )] 
(%1499:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=883)]) -> (%1500:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=884)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=884), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=884), )] (%1500:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=884)]) -> (%1501:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=884)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=879), outputs_0:QuantSpec(Raw(type: Float16), uuid=885), )] (%1494:tensor<[1, 8, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=879)]) -> (%1502:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=885)]) - linalg.CPU.CastTypeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=885), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=886), )] (%1502:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=885)]) -> (%1503:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=886)]) - linalg.CPU.ConcatOp 
[quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=884), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30), )] (%374:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %1501:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=884)]) -> (%1504:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)]) - linalg.CPU.ConcatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=886), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58), )] (%375:tensor<[1, 8, 992, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)], %1503:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=886)]) -> (%1505:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=30), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30), )] (%1504:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)]) -> (%1506:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)]) - linalg.CPU.RepeatOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58), )] (%1505:tensor<[1, 8, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%1507:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887), )] (%1497:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=881)], %1506:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)]) -> (%1508:tensor<[1, 16, 32, 1024], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887), inputs_1:QuantSpec(Raw(type: Float32), uuid=888), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887), )] (%1508:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887)], %1509:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=888), constant:[0.088388346]]) -> (%1510:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887)]) - linalg.CPU.ReduceMinOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889), )] (%1510:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887)]) -> (%1511:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889)]) - linalg.CPU.AddOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889), inputs_1:QuantSpec(Raw(type: Int16), uuid=890), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889), )] 
(%1511:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889)], %1512:tensor<[1], Int16, CPU>[quant_recipe:QuantSpec(Raw(type: Int16), uuid=890), constant:[-20]]) -> (%1513:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889)]) - linalg.CPU.EqualOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2), inputs_1:QuantSpec(Raw(type: UInt16), uuid=891), outputs_0:QuantSpec(Raw(type: UInt8), uuid=892), )] (%319:tensor<[1, 1, 32, 1024], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %1514:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=891), constant:[0.890625]]) -> (%1515:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=892)]) - linalg.CPU.WhereOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=892), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887), inputs_2:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889), )] (%1515:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=892)], %1510:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=887)], %1513:tensor<[1, 16, 32, 1], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889)]) -> (%1516:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889)]) - linalg.CPU.SoftmaxOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=893), )] (%1516:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=889)]) -> (%1517:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=893)]) - linalg.CPU.MatMulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=893), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=894), )] (%1517:tensor<[1, 16, 32, 1024], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=893)], %1507:tensor<[1, 16, 1024, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=58)]) -> (%1518:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=894)]) - linalg.CPU.TransposeOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=894), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, 
scale_type: Float32), uuid=894), )] (%1518:tensor<[1, 16, 32, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=894)]) -> (%1519:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=894)]) - linalg.CPU.ViewOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=894), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=894), )] (%1519:tensor<[1, 32, 16, 128], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=894)]) -> (%1519:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=894)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=894), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=896), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=895))] (%1519:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=894)]) -> (%1520:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=896)]) - cf.ReturnOp (%1520:tensor<[1, 32, 2048], Int16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=896)], %1501:tensor<[1, 8, 128, 32], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=884)], %1503:tensor<[1, 8, 32, 128], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=886)]) -> () - } - } - graph.SubGraphOp @model.layers.27.mlp [using_qnn:true, symbol:model.layers.27.mlp] { - (%1522:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=897)]) -> (%1527:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904)]) { - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=897), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=898))] (%1522:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=897)]) -> (%1523:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899)]) - linalg.CPU.SiLUOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=900), )] (%1523:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=899)]) -> (%1524:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=900)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=897), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=902), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=901))] (%1522:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=897)]) -> (%1525:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=902)]) - linalg.CPU.MulOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=900), inputs_1:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=902), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=900), )] (%1524:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=900)], %1525:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 
32767, quant_to_type: Int16, scale_type: Float32), uuid=902)]) -> (%1526:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=900)]) - linalg.CPU.LinearOp [quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=900), outputs_0:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=903))] (%1526:tensor<[1, 32, 6144], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=900)]) -> (%1527:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904)]) - cf.ReturnOp (%1527:tensor<[1, 32, 2048], Int16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -32768, quant_max: 32767, quant_to_type: Int16, scale_type: Float32), uuid=904)]) -> () + (%8206:tensor<[1, 32], Int32, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: Int32), uuid=0)], %8264:tensor<[32], Int32, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: Int32), uuid=1)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8208:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %8210:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), 
uuid=4)], %8212:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %8214:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %8216:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %8218:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %8220:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %8222:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %8224:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %8226:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %8228:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %8230:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %8232:tensor<[1, 8, 128, 
992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %8234:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %8236:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %8238:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %8240:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %8242:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %8244:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %8246:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %8248:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %8250:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %8252:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %8254:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %8256:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %8258:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %8260:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %8262:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %8209:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31)], %8211:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32)], %8213:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33)], %8215:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34)], %8217:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35)], %8219:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36)], %8221:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37)], %8223:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38)], %8225:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39)], %8227:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40)], %8229:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41)], %8231:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42)], %8233:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43)], %8235:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44)], %8237:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45)], %8239:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46)], %8241:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47)], %8243:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48)], %8245:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49)], %8247:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50)], %8249:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51)], %8251:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52)], %8253:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53)], %8255:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54)], %8257:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55)], %8259:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56)], %8261:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57)], %8263:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58)]) -> (%9726:tensor<[1, 32, 151936], UInt16PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1049)], %8291:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79)], %8343:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114)], %8395:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149)], %8447:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184)], %8499:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219)], %8551:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254)], %8603:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289)], %8655:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324)], %8707:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359)], %8759:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394)], %8811:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429)], %8863:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464)], %8915:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499)], %8967:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534)], %9019:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569)], %9071:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604)], %9123:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639)], %9175:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674)], %9227:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709)], %9279:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744)], %9331:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779)], %9383:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814)], %9435:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849)], %9487:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884)], %9539:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919)], %9591:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954)], %9643:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989)], %9695:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024)], %8293:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=81)], %8345:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=116)], %8397:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=151)], %8449:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=186)], %8501:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=221)], %8553:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=256)], %8605:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=291)], %8657:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=326)], %8709:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=361)], %8761:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=396)], %8813:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=431)], %8865:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=466)], %8917:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=501)], %8969:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=536)], %9021:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=571)], %9073:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=606)], %9125:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=641)], %9177:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=676)], %9229:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=711)], %9281:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=746)], %9333:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=781)], %9385:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=816)], %9437:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=851)], %9489:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=886)], %9541:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=921)], %9593:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=956)], %9645:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=991)], %9697:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1026)]) { + graph.CallGraphOp @model.0.s32 (%8206:tensor<[1, 32], Int32, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: Int32), uuid=0)], %8264:tensor<[32], Int32, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: Int32), uuid=1)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8208:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %8210:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %8212:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %8214:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %8216:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %8218:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %8220:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %8222:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %8224:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %8226:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %8228:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %8230:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %8232:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %8234:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %8236:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %8238:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %8240:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %8242:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=20)], %8244:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %8246:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %8248:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %8250:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %8252:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %8254:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %8256:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %8258:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %8260:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %8262:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=30)], %8209:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31)], %8211:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32)], %8213:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33)], %8215:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34)], %8217:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35)], %8219:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36)], %8221:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37)], %8223:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38)], %8225:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39)], %8227:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40)], 
%8229:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41)], %8231:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42)], %8233:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43)], %8235:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44)], %8237:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45)], %8239:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46)], %8241:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47)], %8243:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48)], %8245:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49)], %8247:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50)], %8249:tensor<[1, 8, 992, 128], 
UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51)], %8251:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52)], %8253:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53)], %8255:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54)], %8257:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55)], %8259:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56)], %8261:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57)], %8263:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58)]) -> (%9726:tensor<[1, 32, 151936], UInt16PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1049)], %8291:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79)], %8343:tensor<[1, 8, 128, 32], 
UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114)], %8395:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149)], %8447:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184)], %8499:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219)], %8551:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254)], %8603:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289)], %8655:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324)], %8707:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359)], %8759:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394)], %8811:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429)], %8863:tensor<[1, 8, 128, 32], UInt8PerTensor, 
CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464)], %8915:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499)], %8967:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534)], %9019:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569)], %9071:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604)], %9123:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639)], %9175:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674)], %9227:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709)], %9279:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744)], %9331:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779)], %9383:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814)], %9435:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849)], %9487:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884)], %9539:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919)], %9591:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954)], %9643:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989)], %9695:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024)], %8293:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=81)], %8345:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=116)], %8397:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=151)], %8449:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=186)], %8501:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=221)], %8553:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=256)], %8605:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=291)], %8657:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=326)], %8709:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=361)], %8761:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=396)], %8813:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=431)], %8865:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=466)], %8917:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=501)], %8969:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=536)], %9021:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=571)], %9073:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=606)], %9125:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=641)], %9177:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=676)], %9229:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=711)], %9281:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=746)], %9333:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=781)], %9385:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=816)], %9437:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=851)], %9489:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=886)], %9541:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=921)], %9593:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=956)], %9645:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=991)], %9697:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1026)]) + cf.ReturnOp (%9726:tensor<[1, 32, 151936], UInt16PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1049)], %8291:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79)], %8343:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114)], %8395:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149)], %8447:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184)], %8499:tensor<[1, 8, 128, 32], UInt8PerTensor, 
CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219)], %8551:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254)], %8603:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289)], %8655:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324)], %8707:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359)], %8759:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394)], %8811:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429)], %8863:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464)], %8915:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499)], %8967:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534)], %9019:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569)], %9071:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604)], %9123:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639)], %9175:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674)], %9227:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709)], %9279:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744)], %9331:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779)], %9383:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814)], %9435:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849)], %9487:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884)], %9539:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919)], %9591:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954)], %9643:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989)], %9695:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024)], %8293:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=81)], %8345:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=116)], %8397:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=151)], %8449:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=186)], %8501:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=221)], %8553:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=256)], %8605:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=291)], %8657:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=326)], %8709:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=361)], %8761:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=396)], %8813:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=431)], %8865:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=466)], %8917:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=501)], %8969:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=536)], %9021:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=571)], %9073:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=606)], %9125:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=641)], %9177:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=676)], %9229:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=711)], %9281:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=746)], %9333:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=781)], %9385:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=816)], %9437:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=851)], %9489:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=886)], %9541:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=921)], %9593:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=956)], %9645:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=991)], %9697:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1026)]) -> () } } // ╔═════╗ @@ -1905,5 +12,1612 @@ // ╚═════╝ // ║ ║ // ╱╩╦╦╩╲ + graph.SubGraphOp @model.0.s32 [use_qnn:true, symbol:model.0.s32] { + (%8206:tensor<[1, 32], Int32, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: Int32), uuid=0)], %8264:tensor<[32], Int32, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: Int32), uuid=1)], %8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8208:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %8210:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %8212:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %8214:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %8216:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %8218:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %8220:tensor<[1, 8, 128, 
992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %8222:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %8224:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %8226:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %8228:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %8230:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %8232:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %8234:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %8236:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %8238:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %8240:tensor<[1, 8, 128, 992], Int8PerTensor, 
CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %8242:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %8244:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %8246:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %8248:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %8250:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %8252:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %8254:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %8256:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %8258:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %8260:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %8262:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %8209:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31)], %8211:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32)], %8213:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33)], %8215:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34)], %8217:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35)], %8219:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36)], %8221:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37)], %8223:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38)], %8225:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39)], %8227:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40)], %8229:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41)], %8231:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42)], %8233:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43)], %8235:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44)], %8237:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45)], %8239:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46)], %8241:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47)], %8243:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48)], %8245:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49)], %8247:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50)], %8249:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51)], %8251:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52)], %8253:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53)], %8255:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54)], %8257:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55)], %8259:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56)], %8261:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57)], %8263:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58)]) -> (%9726:tensor<[1, 32, 151936], UInt16PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1049)], %8291:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79)], %8343:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114)], %8395:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149)], %8447:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184)], %8499:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219)], %8551:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254)], %8603:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289)], %8655:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324)], %8707:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359)], %8759:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394)], %8811:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429)], %8863:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464)], %8915:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499)], %8967:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534)], %9019:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569)], %9071:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604)], %9123:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639)], %9175:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674)], %9227:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709)], %9279:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744)], %9331:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779)], %9383:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814)], %9435:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849)], %9487:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884)], %9539:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919)], %9591:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954)], %9643:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989)], %9695:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024)], %8293:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=81)], %8345:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=116)], %8397:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=151)], %8449:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=186)], %8501:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=221)], %8553:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=256)], %8605:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=291)], %8657:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=326)], %8709:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=361)], %8761:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=396)], %8813:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=431)], %8865:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=466)], %8917:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=501)], %8969:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=536)], %9021:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=571)], %9073:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=606)], %9125:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=641)], %9177:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=676)], %9229:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=711)], %9281:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=746)], %9333:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=781)], %9385:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=816)], %9437:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=851)], %9489:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=886)], %9541:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=921)], %9593:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=956)], %9645:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=991)], %9697:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1026)]) { + linalg.CPU.EmbeddingOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0] (%8206:tensor<[1, 32], Int32, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: Int32), uuid=0)]) -> (%8265:tensor<[1, 32, 2048], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=59)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float32), uuid=59, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), weight_weight:QuantSpec(Raw(type: Float32), uuid=61, solved=0))] 
(%8265:tensor<[1, 32, 2048], Float32, CPU>[quant_recipe:QuantSpec(Raw(type: Float32), uuid=59)]) -> (%8266:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.IndexOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=62, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8204:tensor<[1024, 5, 128], UInt16PerTensor, CPU>[@rope_sin][quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=62), symbol:rope_sin]) -> (%8267:tensor<[1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.IndexOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=64, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8205:tensor<[1024, 5, 128], UInt16PerTensor, CPU>[@rope_cos][quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=64), symbol:rope_cos]) -> (%8268:tensor<[1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=66, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=67, solved=0))] (%8266:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8269:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=66)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=66, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=69, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=68, solved=0))] (%8269:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=66)]) -> (%8270:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=69)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=66, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=71, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=70, solved=0))] (%8269:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=66)]) -> (%8271:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=71)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=66, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=73, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=72, solved=0))] (%8269:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=66)]) -> (%8272:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=73)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=69, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=69, solved=0), )] (%8270:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=69)]) -> (%8270:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=69)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=69, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=69, solved=0), )] (%8270:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=69)]) -> (%8273:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=69)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=71, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=71, solved=0), )] (%8271:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=71)]) -> (%8271:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=71)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=71, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=71, solved=0), )] (%8271:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=71)]) -> (%8274:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=71)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=73, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=73, solved=0), )] (%8272:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=73)]) -> (%8272:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=73)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=73, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=73, solved=0), )] (%8272:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=73)]) -> (%8275:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=73)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=69, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=75, solved=0))] (%8273:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=69)]) -> (%8276:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=71, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=77, solved=0))] (%8274:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=71)]) -> (%8277:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), )] (%8276:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)]) -> (%8276:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), )] (%8276:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)]) -> (%8276:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), )] (%8276:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)]) -> (%8278:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), )] (%8278:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)], %8276:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)]) -> (%8279:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), )] (%8279:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8280:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), )] (%8276:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), 
uuid=65)]) -> (%8281:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), )] (%8281:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)], %8280:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)]) -> (%8282:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), )] (%8277:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=76)]) -> (%8277:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), )] (%8277:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) -> (%8277:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), )] (%8277:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) -> (%8283:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), )] (%8283:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)], %8277:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) -> (%8284:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), )] (%8284:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32), uuid=63)]) -> (%8285:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), )] (%8277:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8286:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), )] (%8286:tensor<[1, 8, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)], %8285:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) -> (%8287:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=78, solved=0), )] (%8287:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=76)]) -> (%8288:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=78)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=78, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79, solved=0), )] (%8288:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=78)]) -> (%8289:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79, 
solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79, solved=0), )] (%8289:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79)]) -> (%8291:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=73, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=80, solved=0), )] (%8275:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=73)]) -> (%8292:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=80)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=80, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=81, solved=0), )] (%8292:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=80)]) -> (%8293:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=81)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3, solved=0), 
inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3, solved=0), )] (%8208:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)], %8291:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79)]) -> (%8295:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=81, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31, solved=0), )] (%8209:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31)], %8293:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=81)]) -> (%8296:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3, solved=0), )] (%8295:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)]) -> (%8297:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31, solved=0), )] (%8296:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31)]) -> (%8298:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=82, solved=0), )] (%8282:tensor<[1, 16, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=74)], %8297:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=3)]) -> (%8299:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=82)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=82, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=83, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=82, solved=0), )] (%8299:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=82)], %8300:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=83), constant:[0.088388346]]) -> (%8301:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=82)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=82, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=84, solved=0), )] (%8301:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=82)]) -> (%8302:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=84)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=84, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=85, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=84, solved=0), )] (%8302:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=84)], %8303:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=85), constant:[-20]]) -> (%8304:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=84)]) + linalg.CPU.EqualOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=86, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), 
uuid=87, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8305:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=86), constant:[0]]) -> (%8306:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=87)]) + linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=87, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=82, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=84, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=84, solved=0), )] (%8306:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=87)], %8301:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=82)], %8304:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=84)]) -> (%8307:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=84)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=84, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=88, solved=0), )] (%8307:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=84)]) -> (%8308:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=88)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=88, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=89, solved=0), )] (%8308:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=88)], %8298:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=31)]) -> (%8309:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=89)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=89, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=89, solved=0), )] (%8309:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=89)]) -> (%8310:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=89)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=89, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=89, solved=0), )] (%8310:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=89)]) -> (%8310:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=89)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=89, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=91, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=90, solved=0))] (%8310:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 
0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=89)]) -> (%8311:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=91)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=91, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8266:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8311:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=91)]) -> (%8312:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=92, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=93, solved=0))] (%8312:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8313:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=92)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=92, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=95, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=94, solved=0))] (%8313:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=92)]) -> (%8314:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=95)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=92, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, 
quant_to_type: UInt4, scale_1_type: Float32), uuid=96, solved=0))] (%8313:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=92)]) -> (%8315:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=98, solved=0), )] (%8315:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97)]) -> (%8316:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=98)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=98, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97, solved=0), )] (%8315:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97)], %8316:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=98)]) -> (%8317:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=95, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97, solved=0), )] (%8317:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97)], %8314:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=95)]) -> (%8318:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=100, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=99, solved=0))] (%8318:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=97)]) -> (%8319:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=100)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=100, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8312:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8319:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=100)]) -> (%8320:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=101, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=102, solved=0))] (%8320:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8321:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=101)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=101, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=104, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=103, solved=0))] (%8321:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=101)]) -> (%8322:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=104)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=101, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=106, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=105, solved=0))] (%8321:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=101)]) -> (%8323:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=106)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=101, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=108, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=107, solved=0))] (%8321:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=101)]) -> (%8324:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=108)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=104, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=104, solved=0), )] (%8322:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=104)]) -> (%8322:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=104)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=104, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=104, solved=0), )] (%8322:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=104)]) -> (%8325:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=104)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=106, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=106, solved=0), )] (%8323:tensor<[1, 32, 1024], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=106)]) -> (%8323:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=106)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=106, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=106, solved=0), )] (%8323:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=106)]) -> (%8326:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=106)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=108, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=108, solved=0), )] (%8324:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=108)]) -> (%8324:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=108)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=108, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=108, solved=0), )] (%8324:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=108)]) -> (%8327:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=108)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=104, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=110, solved=0))] (%8325:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=104)]) -> (%8328:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=106, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=112, solved=0))] (%8326:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=106)]) -> (%8329:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), 
)] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), )] (%8328:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)]) -> (%8328:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), )] (%8328:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)]) -> (%8328:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=109)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), )] (%8328:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)]) -> (%8330:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), )] (%8330:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)], %8328:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)]) -> (%8331:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=109)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), )] (%8331:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8332:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), )] (%8328:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8333:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), )] (%8333:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)], %8332:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)]) -> (%8334:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), )] (%8329:tensor<[1, 8, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) -> (%8329:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), )] (%8329:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) -> (%8329:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), )] (%8329:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) -> (%8335:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=111)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), )] (%8335:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)], %8329:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) -> (%8336:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), )] (%8336:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)], %8267:tensor<[1, 1, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8337:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), )] (%8329:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8338:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), )] (%8338:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)], %8337:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) -> (%8339:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=113, solved=0), )] (%8339:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=111)]) -> (%8340:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=113)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=113, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114, solved=0), )] (%8340:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=113)]) -> (%8341:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114, solved=0), )] (%8341:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114)]) -> (%8343:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=108, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=115, solved=0), )] (%8327:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=108)]) -> (%8344:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=115)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=115, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=116, solved=0), )] (%8344:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=115)]) -> (%8345:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=116)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4, solved=0), )] (%8210:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)], %8343:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114)]) -> (%8347:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=116, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32, solved=0), )] (%8211:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32)], %8345:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=116)]) -> (%8348:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, 
quant_to_type: UInt8, scale_type: Float32), uuid=32)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4, solved=0), )] (%8347:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)]) -> (%8349:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32, solved=0), )] (%8348:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32)]) -> (%8350:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=117, solved=0), )] (%8334:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=109)], %8349:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=4)]) -> (%8351:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=117)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=117, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=118, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=117, solved=0), )] (%8351:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=117)], %8352:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=118), constant:[0.088388346]]) -> (%8353:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=117)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=117, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=119, solved=0), )] (%8353:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=117)]) -> (%8354:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=119)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=119, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=120, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=119, solved=0), )] (%8354:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=119)], %8355:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=120), constant:[-20]]) -> (%8356:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=119)]) + linalg.CPU.EqualOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, 
solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=121, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=122, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8357:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=121), constant:[0]]) -> (%8358:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=122)]) + linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=122, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=117, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=119, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=119, solved=0), )] (%8358:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=122)], %8353:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=117)], %8356:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=119)]) -> (%8359:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=119)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=119, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=123, solved=0), )] (%8359:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=119)]) -> (%8360:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=123)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=123, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=124, solved=0), )] (%8360:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=123)], %8350:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=32)]) -> (%8361:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=124)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=124, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=124, solved=0), )] (%8361:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=124)]) -> (%8362:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=124)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=124, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=124, solved=0), )] (%8362:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=124)]) -> (%8362:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=124)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=124, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=126, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, 
scale_1_type: Float32), uuid=125, solved=0))] (%8362:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=124)]) -> (%8363:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=126)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=126, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8320:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8363:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=126)]) -> (%8364:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=127, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=128, solved=0))] (%8364:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8365:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=127)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=127, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=130, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=129, solved=0))] (%8365:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=127)]) -> (%8366:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=130)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=127, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=132, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=131, solved=0))] (%8365:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=127)]) -> (%8367:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=132)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=132, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=133, solved=0), )] (%8367:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=132)]) -> (%8368:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=133)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=132, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=133, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=132, solved=0), )] (%8367:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=132)], %8368:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=133)]) -> (%8369:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=132)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=132, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=130, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=132, solved=0), )] (%8369:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=132)], %8366:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=130)]) -> (%8370:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=132)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=132, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=135, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=134, solved=0))] (%8370:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=132)]) -> (%8371:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=135)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=135, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8364:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8371:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=135)]) -> (%8372:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=136, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=137, solved=0))] (%8372:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8373:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=136)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=136, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=139, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=138, solved=0))] (%8373:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=136)]) -> (%8374:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=139)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=136, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=141, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=140, solved=0))] (%8373:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=136)]) -> (%8375:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=141)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=136, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=143, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=142, solved=0))] (%8373:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=136)]) -> (%8376:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=143)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=139, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=139, solved=0), )] (%8374:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=139)]) -> (%8374:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=139)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=139, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=139, solved=0), )] (%8374:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=139)]) -> (%8377:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=139)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=141, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=141, solved=0), )] (%8375:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=141)]) -> (%8375:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=141)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=141, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=141, solved=0), )] (%8375:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=141)]) -> (%8378:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=141)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=143, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=143, solved=0), )] (%8376:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=143)]) -> (%8376:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=143)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=143, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=143, solved=0), )] (%8376:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=143)]) -> (%8379:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=143)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=139, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=145, solved=0))] (%8377:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=139)]) -> (%8380:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=141, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=147, solved=0))] (%8378:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=141)]) -> (%8381:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), )] (%8380:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)]) -> (%8380:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), )] (%8380:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)]) -> 
(%8380:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), )] (%8380:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)]) -> (%8382:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), )] (%8382:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)], %8380:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=144)]) -> (%8383:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), )] (%8383:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8384:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), )] (%8380:tensor<[1, 16, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8385:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), )] (%8385:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)], %8384:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)]) -> (%8386:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), )] (%8381:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)]) -> (%8381:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), )] (%8381:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)]) -> (%8381:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), )] (%8381:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=146)]) -> (%8387:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), )] (%8387:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)], %8381:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)]) -> (%8388:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), )] (%8388:tensor<[1, 8, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8389:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), )] (%8381:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8390:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), )] (%8390:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)], %8389:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)]) -> (%8391:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=148, solved=0), )] (%8391:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=146)]) -> (%8392:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=148)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=148, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149, solved=0), )] (%8392:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=148)]) -> (%8393:tensor<[1, 8, 32, 128], UInt8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149, solved=0), )] (%8393:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149)]) -> (%8395:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=143, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=150, solved=0), )] (%8379:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=143)]) -> (%8396:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=150)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=150, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=151, solved=0), )] (%8396:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=150)]) -> (%8397:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, 
quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=151)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5, solved=0), )] (%8212:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)], %8395:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149)]) -> (%8399:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=151, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33, solved=0), )] (%8213:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33)], %8397:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, 
quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=151)]) -> (%8400:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5, solved=0), )] (%8399:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)]) -> (%8401:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33, solved=0), )] (%8400:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33)]) -> (%8402:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=152, solved=0), )] (%8386:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=144)], %8401:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=5)]) -> (%8403:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=152)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=152, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=153, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=152, solved=0), )] (%8403:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=152)], %8404:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=153), constant:[0.088388346]]) -> (%8405:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=152)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=152, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=154, solved=0), )] (%8405:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=152)]) -> (%8406:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=154)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=154, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=155, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=154, solved=0), )] (%8406:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=154)], %8407:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=155), constant:[-20]]) -> (%8408:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=154)]) + linalg.CPU.EqualOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=156, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=157, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8409:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=156), constant:[0]]) -> (%8410:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=157)]) + linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=157, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=152, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=154, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=154, solved=0), )] (%8410:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=157)], %8405:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=152)], %8408:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=154)]) -> (%8411:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=154)]) + 
linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=154, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=158, solved=0), )] (%8411:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=154)]) -> (%8412:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=158)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=158, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=159, solved=0), )] (%8412:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=158)], %8402:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=33)]) -> (%8413:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=159)]) + linalg.CPU.TransposeOp 
[qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=159, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=159, solved=0), )] (%8413:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=159)]) -> (%8414:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=159)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=159, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=159, solved=0), )] (%8414:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=159)]) -> (%8414:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=159)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=159, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=161, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=160, solved=0))] (%8414:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=159)]) -> (%8415:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=161)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=161, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8372:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8415:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=161)]) -> (%8416:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=162, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=163, solved=0))] (%8416:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8417:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=162)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=162, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=165, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=164, solved=0))] (%8417:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=162)]) -> (%8418:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=165)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=162, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=166, solved=0))] (%8417:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=162)]) -> (%8419:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=168, solved=0), )] (%8419:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167)]) -> (%8420:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=168)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=168, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167, solved=0), )] (%8419:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167)], %8420:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=168)]) -> (%8421:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=165, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167, solved=0), )] (%8421:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167)], %8418:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=165)]) -> (%8422:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=167)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=170, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=169, solved=0))] (%8422:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=167)]) -> (%8423:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=170)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=170, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8416:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8423:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=170)]) -> (%8424:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=171, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=172, solved=0))] (%8424:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8425:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=171)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=171, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=174, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=173, solved=0))] (%8425:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=171)]) -> (%8426:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=174)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=171, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=176, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=175, solved=0))] (%8425:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=171)]) -> (%8427:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=176)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=171, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=178, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=177, solved=0))] (%8425:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=171)]) -> (%8428:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=178)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=174, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=174, solved=0), )] (%8426:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=174)]) -> (%8426:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=174)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=174, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=174, solved=0), )] (%8426:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=174)]) -> (%8429:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=174)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=176, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=176, solved=0), )] (%8427:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=176)]) -> (%8427:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=176)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=176, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=176, solved=0), )] (%8427:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=176)]) -> (%8430:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=176)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=178, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=178, 
solved=0), )] (%8428:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=178)]) -> (%8428:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=178)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=178, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=178, solved=0), )] (%8428:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=178)]) -> (%8431:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=178)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=174, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=180, solved=0))] (%8429:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=174)]) -> (%8432:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=176, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=182, solved=0))] (%8430:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=176)]) -> (%8433:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp 
[qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), )] (%8432:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)]) -> (%8432:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), )] (%8432:tensor<[1, 16, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)]) -> (%8432:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), )] (%8432:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)]) -> (%8434:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), )] (%8434:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)], 
%8432:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)]) -> (%8435:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), )] (%8435:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8436:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), )] (%8432:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8437:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), )] (%8437:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)], %8436:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)]) -> (%8438:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), )] (%8433:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) -> (%8433:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), )] (%8433:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) -> (%8433:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), 
)] (%8433:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) -> (%8439:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), )] (%8439:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)], %8433:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) -> (%8440:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), )] (%8440:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8441:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), )] (%8433:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8442:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), )] (%8442:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)], %8441:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) -> (%8443:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=183, solved=0), )] (%8443:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=181)]) -> (%8444:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=183)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=183, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, 
scale_type: Float32), uuid=184, solved=0), )] (%8444:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=183)]) -> (%8445:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184, solved=0), )] (%8445:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184)]) -> (%8447:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=178, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=185, solved=0), )] (%8431:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=178)]) -> (%8448:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=185)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=185, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=186, solved=0), 
)] (%8448:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=185)]) -> (%8449:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=186)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6, solved=0), )] (%8214:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)], %8447:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184)]) -> (%8451:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=186, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34, solved=0), )] (%8215:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 
255, quant_to_type: UInt8, scale_type: Float32), uuid=34)], %8449:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=186)]) -> (%8452:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6, solved=0), )] (%8451:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)]) -> (%8453:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34, solved=0), )] (%8452:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34)]) -> (%8454:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=187, solved=0), )] (%8438:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=179)], %8453:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=6)]) -> (%8455:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=187)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=187, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=188, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=187, solved=0), )] (%8455:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=187)], %8456:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=188), constant:[0.088388346]]) -> (%8457:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=187)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=187, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=189, solved=0), )] (%8457:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=187)]) -> (%8458:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=189)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=189, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=190, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=189, solved=0), )] (%8458:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=189)], %8459:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=190), constant:[-20]]) -> (%8460:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=189)]) + linalg.CPU.EqualOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=191, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=192, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8461:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=191), constant:[0]]) -> (%8462:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=192)]) + linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=192, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=187, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=189, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=189, solved=0), )] (%8462:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=192)], %8457:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=187)], %8460:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=189)]) -> (%8463:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=189)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=189, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=193, solved=0), )] (%8463:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=189)]) -> (%8464:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=193)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=193, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=194, solved=0), )] (%8464:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=193)], %8454:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=34)]) -> 
(%8465:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=194)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=194, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=194, solved=0), )] (%8465:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=194)]) -> (%8466:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=194)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=194, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=194, solved=0), )] (%8466:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=194)]) -> (%8466:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=194)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=194, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=196, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=195, solved=0))] (%8466:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=194)]) -> (%8467:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=196)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=196, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8424:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8467:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=196)]) -> (%8468:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=197, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=198, solved=0))] (%8468:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8469:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=197)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=197, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=200, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=199, solved=0))] (%8469:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=197)]) -> (%8470:tensor<[1, 32, 6144], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=200)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=197, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=201, solved=0))] (%8469:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=197)]) -> (%8471:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=203, solved=0), )] (%8471:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202)]) -> (%8472:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=203)]) + 
linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=203, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202, solved=0), )] (%8471:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202)], %8472:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=203)]) -> (%8473:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=200, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202, solved=0), )] (%8473:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202)], %8470:tensor<[1, 32, 6144], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=200)]) -> (%8474:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=205, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=204, solved=0))] (%8474:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=202)]) -> (%8475:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=205)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=205, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8468:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8475:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=205)]) -> (%8476:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=206, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=207, solved=0))] (%8476:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8477:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=206)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=206, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=209, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: 
-8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=208, solved=0))] (%8477:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=206)]) -> (%8478:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=209)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=206, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=211, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=210, solved=0))] (%8477:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=206)]) -> (%8479:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=211)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=206, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=213, solved=0), 
weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=212, solved=0))] (%8477:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=206)]) -> (%8480:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=213)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=209, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=209, solved=0), )] (%8478:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=209)]) -> (%8478:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=209)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=209, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=209, solved=0), )] (%8478:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=209)]) -> (%8481:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=209)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=211, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=211, solved=0), )] (%8479:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=211)]) -> (%8479:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=211)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=211, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=211, solved=0), )] (%8479:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=211)]) -> (%8482:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=211)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=213, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=213, solved=0), )] (%8480:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=213)]) -> (%8480:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=213)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=213, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=213, solved=0), )] (%8480:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=213)]) -> (%8483:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=213)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=209, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, 
solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=215, solved=0))] (%8481:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=209)]) -> (%8484:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=211, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=217, solved=0))] (%8482:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=211)]) -> (%8485:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), )] (%8484:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)]) -> (%8484:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), )] (%8484:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)]) -> (%8484:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), )] (%8484:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)]) -> (%8486:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), )] (%8486:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)], %8484:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)]) -> (%8487:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), )] (%8487:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8488:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), )] (%8484:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8489:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), )] (%8489:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)], %8488:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=214)]) -> (%8490:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), )] (%8485:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) -> (%8485:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), )] (%8485:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) -> (%8485:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), )] (%8485:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) -> (%8491:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), )] (%8491:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)], %8485:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) -> (%8492:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), )] (%8492:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8493:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), )] (%8485:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32), uuid=65)]) -> (%8494:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), )] (%8494:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)], %8493:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) -> (%8495:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=218, solved=0), )] (%8495:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=216)]) -> (%8496:tensor<[1, 8, 32, 128], Float16, 
CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=218)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=218, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219, solved=0), )] (%8496:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=218)]) -> (%8497:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219, solved=0), )] (%8497:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219)]) -> (%8499:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=213, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=220, solved=0), )] (%8483:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=213)]) -> (%8500:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: 
Float16), uuid=220)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=220, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=221, solved=0), )] (%8500:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=220)]) -> (%8501:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=221)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7, solved=0), )] (%8216:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)], %8499:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219)]) -> (%8503:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: 
UInt8, scale_type: Float32), uuid=221, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35, solved=0), )] (%8217:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35)], %8501:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=221)]) -> (%8504:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7, solved=0), )] (%8503:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)]) -> (%8505:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35, solved=0), )] (%8504:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), 
uuid=35)]) -> (%8506:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=222, solved=0), )] (%8490:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=214)], %8505:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=7)]) -> (%8507:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=222)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=222, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=223, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=222, solved=0), )] (%8507:tensor<[1, 16, 32, 1024], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=222)], %8508:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=223), constant:[0.088388346]]) -> (%8509:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=222)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=222, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=224, solved=0), )] (%8509:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=222)]) -> (%8510:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=224)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=224, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=225, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=224, solved=0), )] (%8510:tensor<[1, 16, 
32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=224)], %8511:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=225), constant:[-20]]) -> (%8512:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=224)]) + linalg.CPU.EqualOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=226, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=227, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8513:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=226), constant:[0]]) -> (%8514:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=227)]) + linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=227, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=222, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=224, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=224, solved=0), )] (%8514:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=227)], %8509:tensor<[1, 16, 32, 1024], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=222)], %8512:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=224)]) -> (%8515:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=224)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=224, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=228, solved=0), )] (%8515:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=224)]) -> (%8516:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=228)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=228, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=229, solved=0), )] (%8516:tensor<[1, 16, 32, 1024], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=228)], %8506:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=35)]) -> (%8517:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=229)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=229, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=229, solved=0), )] (%8517:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=229)]) -> (%8518:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=229)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=229, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=229, solved=0), )] (%8518:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=229)]) -> 
(%8518:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=229)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=229, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=231, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=230, solved=0))] (%8518:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=229)]) -> (%8519:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=231)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=231, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8476:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8519:tensor<[1, 32, 
2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=231)]) -> (%8520:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=232, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=233, solved=0))] (%8520:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8521:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=232)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=232, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=235, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=234, solved=0))] (%8521:tensor<[1, 32, 2048], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=232)]) -> (%8522:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=235)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=232, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=237, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=236, solved=0))] (%8521:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=232)]) -> (%8523:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=237)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=237, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=238, solved=0), )] (%8523:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=237)]) -> (%8524:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=238)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=237, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=238, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=237, solved=0), )] (%8523:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=237)], %8524:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=238)]) -> (%8525:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=237)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=237, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=235, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=237, solved=0), )] (%8525:tensor<[1, 32, 6144], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=237)], %8522:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=235)]) -> (%8526:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=237)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=237, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=240, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=239, solved=0))] (%8526:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=237)]) -> (%8527:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=240)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=240, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8520:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8527:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=240)]) -> (%8528:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=241, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=242, solved=0))] (%8528:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8529:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=241)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=241, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=244, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=243, solved=0))] (%8529:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=241)]) -> (%8530:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=244)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=241, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=246, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=245, solved=0))] (%8529:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=241)]) -> (%8531:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=246)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=241, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=248, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=247, solved=0))] (%8529:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=241)]) -> (%8532:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=248)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=244, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=244, solved=0), )] (%8530:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=244)]) -> (%8530:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=244)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=244, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=244, solved=0), )] (%8530:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=244)]) -> (%8533:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=244)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=246, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=246, solved=0), )] (%8531:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=246)]) -> (%8531:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=246)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=246, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=246, solved=0), )] (%8531:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=246)]) -> (%8534:tensor<[1, 8, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=246)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=248, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=248, solved=0), )] (%8532:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=248)]) -> (%8532:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=248)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=248, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=248, solved=0), )] (%8532:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=248)]) -> (%8535:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=248)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=244, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=250, solved=0))] (%8533:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=244)]) -> (%8536:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=246, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=252, solved=0))] (%8534:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=246)]) -> (%8537:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), )] (%8536:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)]) -> (%8536:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), )] (%8536:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)]) -> (%8536:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), )] (%8536:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)]) -> (%8538:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=249, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), )] (%8538:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)], %8536:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)]) -> (%8539:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), )] (%8539:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8540:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), )] (%8536:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8541:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), )] (%8541:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=249)], %8540:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)]) -> (%8542:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), )] (%8537:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) -> (%8537:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), )] (%8537:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) -> (%8537:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), )] (%8537:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) -> (%8543:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), )] (%8543:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)], %8537:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) -> (%8544:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 
0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), )] (%8544:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8545:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), )] (%8537:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)], 
%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8546:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), )] (%8546:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)], %8545:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) -> (%8547:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=253, solved=0), )] (%8547:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=251)]) -> (%8548:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=253)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=253, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254, solved=0), )] (%8548:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=253)]) -> (%8549:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254, solved=0), )] (%8549:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254)]) -> (%8551:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=248, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=255, solved=0), )] (%8535:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=248)]) -> (%8552:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=255)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=255, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=256, solved=0), )] (%8552:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=255)]) -> (%8553:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=256)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8, solved=0), )] (%8218:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)], %8551:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254)]) -> (%8555:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, 
quant_to_type: UInt8, scale_type: Float32), uuid=36, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=256, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36, solved=0), )] (%8219:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36)], %8553:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=256)]) -> (%8556:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8, solved=0), )] (%8555:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)]) -> (%8557:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36, solved=0), )] (%8556:tensor<[1, 8, 
1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36)]) -> (%8558:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=257, solved=0), )] (%8542:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=249)], %8557:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=8)]) -> (%8559:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=257)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=257, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=258, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=257, solved=0), )] (%8559:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=257)], %8560:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=258), constant:[0.088388346]]) -> (%8561:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=257)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=257, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=259, solved=0), )] (%8561:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=257)]) -> (%8562:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=259)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=259, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=260, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=259, solved=0), )] (%8562:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=259)], %8563:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=260), constant:[-20]]) -> (%8564:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=259)]) + linalg.CPU.EqualOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=261, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=262, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8565:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=261), constant:[0]]) -> (%8566:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=262)]) + linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=262, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=257, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=259, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=259, solved=0), )] (%8566:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: 
UInt8), uuid=262)], %8561:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=257)], %8564:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=259)]) -> (%8567:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=259)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=259, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=263, solved=0), )] (%8567:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=259)]) -> (%8568:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=263)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=263, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=264, solved=0), )] (%8568:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=263)], %8558:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=36)]) -> (%8569:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=264)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=264, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=264, solved=0), )] (%8569:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=264)]) -> (%8570:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=264)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=264, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=264, solved=0), )] (%8570:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=264)]) -> (%8570:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=264)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=264, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=266, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=265, solved=0))] (%8570:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=264)]) -> (%8571:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=266)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=266, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8528:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8571:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=266)]) -> (%8572:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=267, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=268, solved=0))] (%8572:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8573:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=267)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=267, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=270, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: 
UInt4, scale_1_type: Float32), uuid=269, solved=0))] (%8573:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=267)]) -> (%8574:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=270)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=267, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=271, solved=0))] (%8573:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=267)]) -> (%8575:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=273, solved=0), )] (%8575:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272)]) -> (%8576:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=273)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=273, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272, solved=0), )] (%8575:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272)], %8576:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=273)]) -> (%8577:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=270, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=272, solved=0), )] (%8577:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272)], %8574:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=270)]) -> (%8578:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=275, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=274, solved=0))] (%8578:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=272)]) -> (%8579:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=275)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=275, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8572:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8579:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=275)]) -> (%8580:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=276, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=277, solved=0))] (%8580:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8581:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=276)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=276, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=279, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=278, solved=0))] (%8581:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=276)]) -> (%8582:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=279)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=276, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=281, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=280, solved=0))] (%8581:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=276)]) -> (%8583:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=281)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=276, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=283, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=282, solved=0))] (%8581:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=276)]) -> (%8584:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=283)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=279, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=279, solved=0), )] (%8582:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=279)]) -> (%8582:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=279)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=279, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=279, solved=0), )] (%8582:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=279)]) -> (%8585:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=279)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=281, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=281, solved=0), )] (%8583:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=281)]) -> (%8583:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=281)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=281, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=281, solved=0), )] (%8583:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=281)]) -> (%8586:tensor<[1, 
8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=281)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=283, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=283, solved=0), )] (%8584:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=283)]) -> (%8584:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=283)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=283, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=283, solved=0), )] (%8584:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=283)]) -> (%8587:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=283)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 
0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=279, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=285, solved=0))] (%8585:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=279)]) -> (%8588:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=281, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=287, solved=0))] (%8586:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=281)]) -> (%8589:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), )] (%8588:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)]) -> (%8588:tensor<[1, 16, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), )] (%8588:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)]) -> (%8588:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), )] (%8588:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)]) -> (%8590:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), )] (%8590:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)], %8588:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)]) -> (%8591:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), )] (%8591:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8592:tensor<[1, 16, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), )] (%8588:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8593:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), )] (%8593:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)], %8592:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)]) -> (%8594:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), )] (%8589:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)]) -> (%8589:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), )] (%8589:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)]) -> (%8589:tensor<[1, 8, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), )] (%8589:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)]) -> (%8595:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), )] (%8595:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)], %8589:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)]) -> (%8596:tensor<[1, 8, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), )] (%8596:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8597:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), )] (%8589:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=286)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8598:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), )] (%8598:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)], %8597:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)]) -> (%8599:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=288, solved=0), )] (%8599:tensor<[1, 8, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=286)]) -> (%8600:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=288)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=288, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289, solved=0), )] (%8600:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=288)]) -> (%8601:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289, solved=0), )] (%8601:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289)]) -> (%8603:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=283, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=290, solved=0), )] (%8587:tensor<[1, 8, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=283)]) -> (%8604:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=290)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=290, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=291, solved=0), )] (%8604:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=290)]) -> (%8605:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=291)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9, solved=0), )] (%8220:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)], %8603:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289)]) -> (%8607:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=291, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37, solved=0), )] (%8221:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37)], %8605:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=291)]) -> (%8608:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9, solved=0), )] (%8607:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)]) -> (%8609:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37, solved=0), 
outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37, solved=0), )] (%8608:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37)]) -> (%8610:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=292, solved=0), )] (%8594:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=284)], %8609:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=9)]) -> (%8611:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=292)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=292, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=293, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=292, solved=0), )] (%8611:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=292)], %8612:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=293), constant:[0.088388346]]) -> (%8613:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=292)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=292, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=294, solved=0), )] (%8613:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=292)]) -> (%8614:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=294)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=294, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=295, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=294, solved=0), )] (%8614:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=294)], %8615:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=295), constant:[-20]]) -> (%8616:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=294)]) + linalg.CPU.EqualOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=296, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=297, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8617:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=296), constant:[0]]) -> (%8618:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=297)]) + linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=297, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=292, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=294, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=294, solved=0), )] (%8618:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=297)], %8613:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=292)], %8616:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=294)]) -> (%8619:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=294)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=294, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=298, solved=0), )] (%8619:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=294)]) -> (%8620:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=298)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=298, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), 
uuid=37, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=299, solved=0), )] (%8620:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=298)], %8610:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=37)]) -> (%8621:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=299)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=299, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=299, solved=0), )] (%8621:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=299)]) -> (%8622:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=299)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=299, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=299, solved=0), )] (%8622:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=299)]) -> (%8622:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=299)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=299, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=301, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=300, solved=0))] (%8622:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=299)]) -> (%8623:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=301)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=301, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, 
solved=0), )] (%8580:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8623:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=301)]) -> (%8624:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=302, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=303, solved=0))] (%8624:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8625:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=302)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=302, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=305, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=304, solved=0))] (%8625:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=302)]) -> (%8626:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=305)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=302, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=306, solved=0))] (%8625:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=302)]) -> (%8627:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=308, solved=0), )] (%8627:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307)]) -> (%8628:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=308)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=308, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307, solved=0), )] (%8627:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307)], %8628:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=308)]) -> (%8629:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=305, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307, solved=0), )] (%8629:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307)], %8626:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=305)]) -> (%8630:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=310, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=309, solved=0))] (%8630:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=307)]) -> (%8631:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=310)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=310, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8624:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8631:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=310)]) -> (%8632:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=311, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=312, solved=0))] (%8632:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8633:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=311)]) + 
linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=311, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=314, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=313, solved=0))] (%8633:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=311)]) -> (%8634:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=314)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=311, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=316, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=315, solved=0))] (%8633:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=311)]) -> (%8635:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=316)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=311, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=318, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=317, solved=0))] (%8633:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=311)]) -> (%8636:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=318)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=314, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=314, solved=0), )] (%8634:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=314)]) -> (%8634:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=314)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=314, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=314, solved=0), )] (%8634:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=314)]) -> (%8637:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=314)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=316, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=316, solved=0), )] (%8635:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=316)]) -> (%8635:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=316)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=316, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=316, 
solved=0), )] (%8635:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=316)]) -> (%8638:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=316)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=318, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=318, solved=0), )] (%8636:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=318)]) -> (%8636:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=318)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=318, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=318, solved=0), )] (%8636:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=318)]) -> (%8639:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=318)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=314, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=320, solved=0))] (%8637:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=314)]) -> (%8640:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=316, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=322, solved=0))] (%8638:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=316)]) -> (%8641:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), )] (%8640:tensor<[1, 16, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)]) -> (%8640:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), )] (%8640:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)]) -> (%8640:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), )] (%8640:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)]) -> (%8642:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=319)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), )] (%8642:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)], %8640:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)]) -> (%8643:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), )] (%8643:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)], 
%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8644:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), )] (%8640:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8645:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), )] (%8645:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)], %8644:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)]) -> (%8646:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), )] (%8641:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) -> (%8641:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), )] (%8641:tensor<[1, 8, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) -> (%8641:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), )] (%8641:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) -> (%8647:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), )] (%8647:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)], 
%8641:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) -> (%8648:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), )] (%8648:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8649:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), )] (%8641:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8650:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), )] (%8650:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)], %8649:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) -> (%8651:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=323, solved=0), )] (%8651:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=321)]) -> (%8652:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=323)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=323, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324, solved=0), )] (%8652:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=323)]) -> (%8653:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324, solved=0), )] (%8653:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324)]) -> (%8655:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=318, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=325, solved=0), )] (%8639:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=318)]) -> (%8656:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=325)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=325, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=326, solved=0), )] (%8656:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=325)]) -> (%8657:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=326)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10, solved=0), )] (%8222:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)], %8655:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, 
quant_to_type: UInt8, scale_type: Float32), uuid=324)]) -> (%8659:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=326, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38, solved=0), )] (%8223:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38)], %8657:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=326)]) -> (%8660:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10, solved=0), )] (%8659:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)]) -> (%8661:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, 
scale_type: Float32), uuid=10)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38, solved=0), )] (%8660:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38)]) -> (%8662:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=327, solved=0), )] (%8646:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=319)], %8661:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=10)]) -> (%8663:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=327)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=327, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=328, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=327, solved=0), )] (%8663:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=327)], %8664:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=328), constant:[0.088388346]]) -> (%8665:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=327)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=327, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=329, solved=0), )] (%8665:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=327)]) -> (%8666:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=329)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=329, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=330, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=329, solved=0), )] (%8666:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=329)], %8667:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=330), constant:[-20]]) -> (%8668:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=329)]) + linalg.CPU.EqualOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=331, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=332, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8669:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=331), constant:[0]]) -> (%8670:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=332)]) + linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=332, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=327, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=329, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=329, solved=0), )] (%8670:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=332)], %8665:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=327)], %8668:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=329)]) -> (%8671:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=329)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=329, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=333, solved=0), )] (%8671:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=329)]) -> (%8672:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=333)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=333, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=334, solved=0), )] (%8672:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=333)], %8662:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=38)]) -> (%8673:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=334)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=334, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=334, solved=0), )] (%8673:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=334)]) -> (%8674:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=334)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=334, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=334, solved=0), )] (%8674:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=334)]) -> (%8674:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=334)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=334, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=336, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=335, solved=0))] (%8674:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=334)]) -> (%8675:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=336)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=336, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8632:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8675:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=336)]) -> (%8676:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=337, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=338, solved=0))] (%8676:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8677:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=337)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=337, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=340, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=339, solved=0))] (%8677:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=337)]) -> (%8678:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=340)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=337, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=341, solved=0))] (%8677:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=337)]) -> (%8679:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=343, solved=0), )] (%8679:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342)]) -> (%8680:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=343)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=343, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342, solved=0), )] (%8679:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342)], %8680:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=343)]) -> (%8681:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342)]) + linalg.CPU.MulOp 
[qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=340, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342, solved=0), )] (%8681:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342)], %8678:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=340)]) -> (%8682:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=345, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=344, solved=0))] (%8682:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=342)]) -> (%8683:tensor<[1, 32, 2048], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=345)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=345, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8676:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8683:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=345)]) -> (%8684:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=346, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=347, solved=0))] (%8684:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8685:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=346)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=346, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=349, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=348, solved=0))] (%8685:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=346)]) -> (%8686:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=349)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=346, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=351, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=350, solved=0))] (%8685:tensor<[1, 32, 2048], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=346)]) -> (%8687:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=351)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=346, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=353, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=352, solved=0))] (%8685:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=346)]) -> (%8688:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=353)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=349, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=349, solved=0), )] (%8686:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=349)]) -> (%8686:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=349)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=349, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=349, solved=0), )] (%8686:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=349)]) -> (%8689:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=349)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=351, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=351, solved=0), )] (%8687:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=351)]) -> (%8687:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=351)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=351, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=351, solved=0), )] (%8687:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=351)]) -> (%8690:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=351)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=353, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=353, solved=0), )] (%8688:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=353)]) -> (%8688:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=353)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=353, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=353, 
solved=0), )] (%8688:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=353)]) -> (%8691:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=353)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=349, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=355, solved=0))] (%8689:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=349)]) -> (%8692:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=351, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=357, solved=0))] 
(%8690:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=351)]) -> (%8693:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), )] (%8692:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)]) -> (%8692:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), )] (%8692:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)]) -> (%8692:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, 
solved=0), )] (%8692:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)]) -> (%8694:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), )] (%8694:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)], %8692:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)]) -> (%8695:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), )] (%8695:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8696:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), )] (%8692:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8697:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), )] (%8697:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)], %8696:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)]) -> (%8698:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), )] (%8693:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) -> (%8693:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), )] (%8693:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) -> (%8693:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), )] (%8693:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) -> (%8699:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=356, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), )] (%8699:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)], %8693:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) -> (%8700:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), )] (%8700:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8701:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), )] (%8693:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8702:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), )] (%8702:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)], %8701:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) -> (%8703:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=358, solved=0), )] (%8703:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=356)]) -> (%8704:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=358)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=358, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359, solved=0), )] (%8704:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=358)]) -> (%8705:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359, solved=0), )] (%8705:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, 
scale_type: Float32), uuid=359)]) -> (%8707:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=353, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=360, solved=0), )] (%8691:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=353)]) -> (%8708:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=360)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=360, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=361, solved=0), )] (%8708:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=360)]) -> (%8709:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=361)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11, solved=0), )] (%8224:tensor<[1, 8, 128, 992], 
Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)], %8707:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359)]) -> (%8711:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=361, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39, solved=0), )] (%8225:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39)], %8709:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=361)]) -> (%8712:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11, solved=0), )] (%8711:tensor<[1, 8, 128, 
1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)]) -> (%8713:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39, solved=0), )] (%8712:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39)]) -> (%8714:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=362, solved=0), )] (%8698:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=354)], %8713:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=11)]) -> (%8715:tensor<[1, 
16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=362)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=362, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=363, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=362, solved=0), )] (%8715:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=362)], %8716:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=363), constant:[0.088388346]]) -> (%8717:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=362)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=362, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=364, solved=0), )] (%8717:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=362)]) -> (%8718:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=364)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=364, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=365, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=364, solved=0), )] (%8718:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=364)], %8719:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=365), constant:[-20]]) -> (%8720:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=364)]) + linalg.CPU.EqualOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=366, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=367, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8721:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=366), constant:[0]]) -> (%8722:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=367)]) 
+ linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=367, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=362, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=364, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=364, solved=0), )] (%8722:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=367)], %8717:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=362)], %8720:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=364)]) -> (%8723:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=364)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=364, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=368, solved=0), )] (%8723:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=364)]) -> (%8724:tensor<[1, 16, 
32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=368)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=368, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=369, solved=0), )] (%8724:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=368)], %8714:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=39)]) -> (%8725:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=369)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=369, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=369, solved=0), )] (%8725:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=369)]) -> (%8726:tensor<[1, 32, 16, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=369)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=369, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=369, solved=0), )] (%8726:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=369)]) -> (%8726:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=369)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=369, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=371, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=370, solved=0))] (%8726:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=369)]) -> (%8727:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=371)]) + 
linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=371, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8684:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8727:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=371)]) -> (%8728:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=372, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=373, solved=0))] (%8728:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8729:tensor<[1, 32, 2048], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=372)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=372, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=375, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=374, solved=0))] (%8729:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=372)]) -> (%8730:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=375)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=372, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=376, solved=0))] (%8729:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=372)]) -> 
(%8731:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=378, solved=0), )] (%8731:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377)]) -> (%8732:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=378)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=378, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377, solved=0), )] (%8731:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377)], %8732:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=378)]) -> (%8733:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=375, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377, solved=0), )] (%8733:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377)], %8730:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=375)]) -> (%8734:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=380, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=379, solved=0))] 
(%8734:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=377)]) -> (%8735:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=380)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=380, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8728:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8735:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=380)]) -> (%8736:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=381, solved=0), 
weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=382, solved=0))] (%8736:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8737:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=381)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=381, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=384, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=383, solved=0))] (%8737:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=381)]) -> (%8738:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=384)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=381, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=386, solved=0), 
weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=385, solved=0))] (%8737:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=381)]) -> (%8739:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=386)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=381, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=388, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=387, solved=0))] (%8737:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=381)]) -> (%8740:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=388)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=384, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=384, 
solved=0), )] (%8738:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=384)]) -> (%8738:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=384)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=384, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=384, solved=0), )] (%8738:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=384)]) -> (%8741:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=384)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=386, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=386, solved=0), )] (%8739:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=386)]) -> (%8739:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=386)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=386, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=386, solved=0), )] (%8739:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=386)]) -> (%8742:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=386)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=388, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=388, solved=0), )] (%8740:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=388)]) -> (%8740:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=388)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=388, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=388, solved=0), )] (%8740:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=388)]) -> (%8743:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=388)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=384, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=390, solved=0))] (%8741:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=384)]) -> (%8744:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=386, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, 
solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=392, solved=0))] (%8742:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=386)]) -> (%8745:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), )] (%8744:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)]) -> (%8744:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), )] (%8744:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)]) -> (%8744:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), )] (%8744:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)]) -> (%8746:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), )] (%8746:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)], %8744:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)]) -> (%8747:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=389, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), )] (%8747:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8748:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), )] (%8744:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8749:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=389)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), )] (%8749:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)], %8748:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)]) -> (%8750:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), )] (%8745:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)]) -> (%8745:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=391)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), )] (%8745:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)]) -> (%8745:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), )] (%8745:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)]) -> (%8751:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), 
inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), )] (%8751:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)], %8745:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)]) -> (%8752:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), )] (%8752:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8753:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=391)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), )] (%8745:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8754:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), )] (%8754:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)], %8753:tensor<[1, 8, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)]) -> (%8755:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=393, solved=0), )] (%8755:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=391)]) -> (%8756:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=393)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=393, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394, solved=0), )] (%8756:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=393)]) -> (%8757:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394, solved=0), )] (%8757:tensor<[1, 8, 32, 128], 
UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394)]) -> (%8759:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=388, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=395, solved=0), )] (%8743:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=388)]) -> (%8760:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=395)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=395, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=396, solved=0), )] (%8760:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=395)]) -> (%8761:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=396)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12, solved=0), )] (%8226:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)], %8759:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394)]) -> (%8763:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=396, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40, solved=0), )] (%8227:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40)], %8761:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=396)]) -> (%8764:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12, solved=0), )] (%8763:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)]) -> (%8765:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40, solved=0), )] (%8764:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40)]) -> (%8766:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=397, solved=0), )] (%8750:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=389)], %8765:tensor<[1, 16, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=12)]) -> (%8767:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=397)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=397, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=398, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=397, solved=0), )] (%8767:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=397)], %8768:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=398), constant:[0.088388346]]) -> (%8769:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=397)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=397, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=399, solved=0), )] (%8769:tensor<[1, 16, 32, 1024], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=397)]) -> (%8770:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=399)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=399, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=400, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=399, solved=0), )] (%8770:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=399)], %8771:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=400), constant:[-20]]) -> (%8772:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=399)]) + linalg.CPU.EqualOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=401, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=402, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8773:tensor<[1], UInt16, 
CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=401), constant:[0]]) -> (%8774:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=402)]) + linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=402, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=397, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=399, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=399, solved=0), )] (%8774:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=402)], %8769:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=397)], %8772:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=399)]) -> (%8775:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=399)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=399, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=403, solved=0), )] (%8775:tensor<[1, 16, 32, 1024], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=399)]) -> (%8776:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=403)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=403, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=404, solved=0), )] (%8776:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=403)], %8766:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=40)]) -> (%8777:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=404)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=404, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=404, solved=0), )] (%8777:tensor<[1, 16, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=404)]) -> (%8778:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=404)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=404, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=404, solved=0), )] (%8778:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=404)]) -> (%8778:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=404)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=404, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=406, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=405, solved=0))] (%8778:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=404)]) -> 
(%8779:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=406)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=406, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8736:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8779:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=406)]) -> (%8780:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=407, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=408, solved=0))] (%8780:tensor<[1, 32, 2048], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8781:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=407)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=407, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=410, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=409, solved=0))] (%8781:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=407)]) -> (%8782:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=410)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=407, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=411, solved=0))] (%8781:tensor<[1, 
32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=407)]) -> (%8783:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=413, solved=0), )] (%8783:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412)]) -> (%8784:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=413)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=413, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412, solved=0), )] (%8783:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412)], 
%8784:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=413)]) -> (%8785:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=410, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412, solved=0), )] (%8785:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412)], %8782:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=410)]) -> (%8786:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=415, 
solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=414, solved=0))] (%8786:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=412)]) -> (%8787:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=415)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=415, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8780:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8787:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=415)]) -> (%8788:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=416, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=417, solved=0))] (%8788:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8789:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=416)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=416, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=419, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=418, solved=0))] (%8789:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=416)]) -> (%8790:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=419)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=416, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=421, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=420, solved=0))] (%8789:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=416)]) -> (%8791:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=421)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=416, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=423, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=422, solved=0))] (%8789:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=416)]) -> (%8792:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=423)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=419, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=419, solved=0), )] (%8790:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=419)]) -> (%8790:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=419)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=419, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=419, solved=0), )] (%8790:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=419)]) -> (%8793:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=419)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=421, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=421, solved=0), )] (%8791:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=421)]) -> (%8791:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=421)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=421, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=421, solved=0), )] (%8791:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=421)]) -> (%8794:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=421)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=423, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=423, solved=0), )] (%8792:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=423)]) -> (%8792:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=423)]) + linalg.CPU.TransposeOp 
[qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=423, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=423, solved=0), )] (%8792:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=423)]) -> (%8795:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=423)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=419, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=425, solved=0))] (%8793:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=419)]) -> (%8796:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=421, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=427, solved=0))] (%8794:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=421)]) -> (%8797:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), )] (%8796:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)]) -> (%8796:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), )] (%8796:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)]) -> (%8796:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), )] (%8796:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)]) -> (%8798:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), )] (%8798:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)], %8796:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)]) -> (%8799:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), )] (%8799:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8800:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), )] (%8796:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32), uuid=65)]) -> (%8801:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), )] (%8801:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)], %8800:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)]) -> (%8802:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), )] (%8797:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=426)]) -> (%8797:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), )] (%8797:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)]) -> (%8797:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), )] (%8797:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)]) -> (%8803:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), )] (%8803:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)], %8797:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)]) -> (%8804:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), )] (%8804:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32), uuid=63)]) -> (%8805:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), )] (%8797:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8806:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), )] (%8806:tensor<[1, 8, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)], %8805:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)]) -> (%8807:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=428, solved=0), )] (%8807:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=426)]) -> (%8808:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=428)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=428, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429, solved=0), )] (%8808:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=428)]) -> (%8809:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: 
Float32), uuid=429, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429, solved=0), )] (%8809:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429)]) -> (%8811:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=423, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=430, solved=0), )] (%8795:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=423)]) -> (%8812:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=430)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=430, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=431, solved=0), )] (%8812:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=430)]) -> (%8813:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=431)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13, 
solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13, solved=0), )] (%8228:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)], %8811:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429)]) -> (%8815:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=431, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41, solved=0), )] (%8229:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41)], %8813:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=431)]) -> (%8816:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13, solved=0), )] (%8815:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)]) -> (%8817:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41, solved=0), )] (%8816:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41)]) -> (%8818:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=432, solved=0), )] (%8802:tensor<[1, 16, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=424)], %8817:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=13)]) -> (%8819:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=432)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=432, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=433, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=432, solved=0), )] (%8819:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=432)], %8820:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=433), constant:[0.088388346]]) -> (%8821:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=432)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=432, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=434, solved=0), )] (%8821:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=432)]) -> (%8822:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=434)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=434, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=435, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=434, solved=0), )] (%8822:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=434)], %8823:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=435), constant:[-20]]) -> (%8824:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=434)]) + linalg.CPU.EqualOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=436, solved=0), 
outputs_0:QuantSpec(Raw(type: UInt8), uuid=437, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8825:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=436), constant:[0]]) -> (%8826:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=437)]) + linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=437, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=432, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=434, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=434, solved=0), )] (%8826:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=437)], %8821:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=432)], %8824:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=434)]) -> (%8827:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=434)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=434, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=438, solved=0), )] (%8827:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=434)]) -> (%8828:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=438)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=438, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=439, solved=0), )] (%8828:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=438)], %8818:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=41)]) -> (%8829:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=439)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=439, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=439, solved=0), )] (%8829:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=439)]) -> (%8830:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=439)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=439, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=439, solved=0), )] (%8830:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=439)]) -> (%8830:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=439)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=439, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=441, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=440, solved=0))] 
(%8830:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=439)]) -> (%8831:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=441)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=441, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8788:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8831:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=441)]) -> (%8832:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=442, solved=0), 
weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=443, solved=0))] (%8832:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8833:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=442)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=442, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=445, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=444, solved=0))] (%8833:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=442)]) -> (%8834:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=445)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=442, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447, solved=0), 
weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=446, solved=0))] (%8833:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=442)]) -> (%8835:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=448, solved=0), )] (%8835:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447)]) -> (%8836:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=448)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=448, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447, solved=0), )] 
(%8835:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447)], %8836:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=448)]) -> (%8837:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=445, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447, solved=0), )] (%8837:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447)], %8834:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=445)]) -> (%8838:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=447, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=450, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=449, solved=0))] (%8838:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=447)]) -> (%8839:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=450)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=450, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8832:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8839:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=450)]) -> (%8840:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp 
[qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=451, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=452, solved=0))] (%8840:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8841:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=451)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=451, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=454, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=453, solved=0))] (%8841:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=451)]) -> (%8842:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=454)]) + linalg.CPU.LinearOp 
[qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=451, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=456, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=455, solved=0))] (%8841:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=451)]) -> (%8843:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=456)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=451, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=458, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=457, solved=0))] (%8841:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=451)]) -> (%8844:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=458)]) + 
linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=454, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=454, solved=0), )] (%8842:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=454)]) -> (%8842:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=454)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=454, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=454, solved=0), )] (%8842:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=454)]) -> (%8845:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=454)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=456, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=456, solved=0), )] (%8843:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=456)]) -> (%8843:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=456)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=456, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=456, solved=0), )] (%8843:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=456)]) -> (%8846:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=456)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=458, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=458, solved=0), )] (%8844:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=458)]) -> (%8844:tensor<[1, 32, 8, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=458)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=458, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=458, solved=0), )] (%8844:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=458)]) -> (%8847:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=458)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=454, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=460, solved=0))] (%8845:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=454)]) -> (%8848:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)]) + 
linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=456, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=462, solved=0))] (%8846:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=456)]) -> (%8849:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), )] (%8848:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)]) -> (%8848:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), )] (%8848:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)]) -> (%8848:tensor<[1, 16, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), )] (%8848:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)]) -> (%8850:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), )] (%8850:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)], %8848:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)]) -> (%8851:tensor<[1, 16, 32, 
128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), )] (%8851:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8852:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), )] (%8848:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8853:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), )] (%8853:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)], %8852:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)]) -> (%8854:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=461, solved=0), )] (%8849:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)]) -> (%8849:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), )] (%8849:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)]) -> (%8849:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), )] (%8849:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)]) -> (%8855:tensor<[1, 8, 32, 64], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), )] (%8855:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)], %8849:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)]) -> (%8856:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), )] (%8856:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8857:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), )] (%8849:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8858:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=461, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), )] (%8858:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)], %8857:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)]) -> (%8859:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=463, solved=0), )] (%8859:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=461)]) -> (%8860:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=463)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=463, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464, solved=0), )] (%8860:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=463)]) -> (%8861:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), 
uuid=464)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464, solved=0), )] (%8861:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464)]) -> (%8863:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=458, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=465, solved=0), )] (%8847:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=458)]) -> (%8864:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=465)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=465, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=466, solved=0), )] (%8864:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=465)]) -> (%8865:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=466)]) + 
linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14, solved=0), )] (%8230:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)], %8863:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464)]) -> (%8867:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=466, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42, solved=0), )] (%8231:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42)], %8865:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=466)]) -> (%8868:tensor<[1, 8, 1024, 128], 
UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14, solved=0), )] (%8867:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)]) -> (%8869:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42, solved=0), )] (%8868:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42)]) -> (%8870:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=467, solved=0), )] (%8854:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=459)], %8869:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=14)]) -> (%8871:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=467)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=467, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=468, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=467, solved=0), )] (%8871:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=467)], %8872:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=468), constant:[0.088388346]]) -> (%8873:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=467)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=467, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=469, solved=0), )] (%8873:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=467)]) -> (%8874:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=469)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=469, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=470, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=469, solved=0), )] (%8874:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=469)], %8875:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=470), constant:[-20]]) -> (%8876:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=469)]) + linalg.CPU.EqualOp 
[qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=471, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=472, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8877:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=471), constant:[0]]) -> (%8878:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=472)]) + linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=472, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=467, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=469, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=469, solved=0), )] (%8878:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=472)], %8873:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=467)], %8876:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=469)]) -> (%8879:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=469)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=469, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=473, solved=0), )] (%8879:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=469)]) -> (%8880:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=473)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=473, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=474, solved=0), )] (%8880:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=473)], %8870:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=42)]) -> (%8881:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=474)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=474, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=474, solved=0), )] (%8881:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=474)]) -> (%8882:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=474)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=474, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=474, solved=0), )] (%8882:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=474)]) -> (%8882:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=474)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=474, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=476, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=475, solved=0))] (%8882:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=474)]) -> (%8883:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=476)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=476, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8840:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8883:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=476)]) -> (%8884:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=477, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=478, solved=0))] (%8884:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8885:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=477)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=477, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=480, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=479, solved=0))] (%8885:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=477)]) -> (%8886:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=480)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=477, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=481, solved=0))] (%8885:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=477)]) -> (%8887:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=483, solved=0), )] (%8887:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482)]) -> (%8888:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=483)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=483, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482, solved=0), )] (%8887:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482)], %8888:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=483)]) -> (%8889:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=480, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482, solved=0), )] (%8889:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482)], %8886:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=480)]) -> (%8890:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482)]) + 
linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=485, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=484, solved=0))] (%8890:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=482)]) -> (%8891:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=485)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=485, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8884:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8891:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=485)]) -> (%8892:tensor<[1, 
32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=486, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=487, solved=0))] (%8892:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8893:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=486)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=486, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=489, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=488, solved=0))] (%8893:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=486)]) -> (%8894:tensor<[1, 32, 2048], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=489)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=486, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=491, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=490, solved=0))] (%8893:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=486)]) -> (%8895:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=491)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=486, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=493, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=492, solved=0))] (%8893:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=486)]) -> 
(%8896:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=493)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=489, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=489, solved=0), )] (%8894:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=489)]) -> (%8894:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=489)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=489, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=489, solved=0), )] (%8894:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=489)]) -> (%8897:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=489)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=491, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=491, solved=0), )] (%8895:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=491)]) -> (%8895:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=491)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=491, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=491, solved=0), )] (%8895:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=491)]) -> (%8898:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=491)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=493, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=493, 
solved=0), )] (%8896:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=493)]) -> (%8896:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=493)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=493, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=493, solved=0), )] (%8896:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=493)]) -> (%8899:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=493)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=489, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=495, solved=0))] (%8897:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=489)]) -> (%8900:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=491, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=497, solved=0))] (%8898:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=491)]) -> (%8901:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp 
[qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), )] (%8900:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)]) -> (%8900:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), )] (%8900:tensor<[1, 16, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)]) -> (%8900:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), )] (%8900:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)]) -> (%8902:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), )] (%8902:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)], 
%8900:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)]) -> (%8903:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), )] (%8903:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8904:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), )] (%8900:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8905:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), )] (%8905:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)], %8904:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)]) -> (%8906:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), )] (%8901:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) -> (%8901:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), )] (%8901:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) -> (%8901:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), 
)] (%8901:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) -> (%8907:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), )] (%8907:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)], %8901:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) -> (%8908:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), )] (%8908:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8909:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), )] (%8901:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8910:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), )] (%8910:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)], %8909:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) -> (%8911:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=498, solved=0), )] (%8911:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=496)]) -> (%8912:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=498)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=498, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, 
scale_type: Float32), uuid=499, solved=0), )] (%8912:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=498)]) -> (%8913:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499, solved=0), )] (%8913:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499)]) -> (%8915:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=493, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=500, solved=0), )] (%8899:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=493)]) -> (%8916:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=500)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=500, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=501, solved=0), 
)] (%8916:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=500)]) -> (%8917:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=501)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15, solved=0), )] (%8232:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)], %8915:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499)]) -> (%8919:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=501, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43, solved=0), )] (%8233:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, 
quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43)], %8917:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=501)]) -> (%8920:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15, solved=0), )] (%8919:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)]) -> (%8921:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43, solved=0), )] (%8920:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43)]) -> (%8922:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=502, solved=0), )] (%8906:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=494)], %8921:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=15)]) -> (%8923:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=502)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=502, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=503, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=502, solved=0), )] (%8923:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=502)], %8924:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=503), constant:[0.088388346]]) -> (%8925:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=502)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=502, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=504, solved=0), )] (%8925:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=502)]) -> (%8926:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=504)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=504, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=505, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=504, solved=0), )] (%8926:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=504)], %8927:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=505), constant:[-20]]) -> (%8928:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=504)]) + linalg.CPU.EqualOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=506, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=507, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8929:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=506), constant:[0]]) -> (%8930:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=507)]) + linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=507, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=502, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=504, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=504, solved=0), )] (%8930:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=507)], %8925:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=502)], %8928:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=504)]) -> (%8931:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=504)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=504, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=508, solved=0), )] (%8931:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=504)]) -> (%8932:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=508)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=508, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=509, solved=0), )] (%8932:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=508)], %8922:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=43)]) -> 
(%8933:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=509)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=509, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=509, solved=0), )] (%8933:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=509)]) -> (%8934:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=509)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=509, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=509, solved=0), )] (%8934:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=509)]) -> (%8934:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=509)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=509, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=511, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=510, solved=0))] (%8934:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=509)]) -> (%8935:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=511)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=511, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8892:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8935:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=511)]) -> (%8936:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=512, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=513, solved=0))] (%8936:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8937:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=512)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=512, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=515, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=514, solved=0))] (%8937:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=512)]) -> (%8938:tensor<[1, 32, 6144], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=515)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=512, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=516, solved=0))] (%8937:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=512)]) -> (%8939:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=518, solved=0), )] (%8939:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517)]) -> (%8940:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=518)]) + 
linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=518, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517, solved=0), )] (%8939:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517)], %8940:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=518)]) -> (%8941:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=515, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517, solved=0), )] (%8941:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517)], %8938:tensor<[1, 32, 6144], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=515)]) -> (%8942:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=520, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=519, solved=0))] (%8942:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=517)]) -> (%8943:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=520)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=520, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8936:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8943:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=520)]) -> (%8944:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=521, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=522, solved=0))] (%8944:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8945:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=521)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=521, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=524, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: 
-8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=523, solved=0))] (%8945:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=521)]) -> (%8946:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=524)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=521, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=526, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=525, solved=0))] (%8945:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=521)]) -> (%8947:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=526)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=521, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=528, solved=0), 
weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=527, solved=0))] (%8945:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=521)]) -> (%8948:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=528)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=524, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=524, solved=0), )] (%8946:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=524)]) -> (%8946:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=524)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=524, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=524, solved=0), )] (%8946:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=524)]) -> (%8949:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=524)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=526, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=526, solved=0), )] (%8947:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=526)]) -> (%8947:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=526)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=526, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=526, solved=0), )] (%8947:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=526)]) -> (%8950:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=526)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=528, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=528, solved=0), )] (%8948:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=528)]) -> (%8948:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=528)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=528, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=528, solved=0), )] (%8948:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=528)]) -> (%8951:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=528)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=524, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, 
solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=530, solved=0))] (%8949:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=524)]) -> (%8952:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=526, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=532, solved=0))] (%8950:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=526)]) -> (%8953:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), )] (%8952:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)]) -> (%8952:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), )] (%8952:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)]) -> (%8952:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), )] (%8952:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)]) -> (%8954:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), )] (%8954:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)], %8952:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)]) -> (%8955:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), )] (%8955:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8956:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), )] (%8952:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8957:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), )] (%8957:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)], %8956:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=529)]) -> (%8958:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), )] (%8953:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) -> (%8953:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), )] (%8953:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) -> (%8953:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), )] (%8953:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) -> (%8959:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), )] (%8959:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)], %8953:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) -> (%8960:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), )] (%8960:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8961:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), )] (%8953:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32), uuid=65)]) -> (%8962:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), )] (%8962:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)], %8961:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) -> (%8963:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=533, solved=0), )] (%8963:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=531)]) -> (%8964:tensor<[1, 8, 32, 128], Float16, 
CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=533)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=533, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534, solved=0), )] (%8964:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=533)]) -> (%8965:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534, solved=0), )] (%8965:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534)]) -> (%8967:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=528, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=535, solved=0), )] (%8951:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=528)]) -> (%8968:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: 
Float16), uuid=535)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=535, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=536, solved=0), )] (%8968:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=535)]) -> (%8969:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=536)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16, solved=0), )] (%8234:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)], %8967:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534)]) -> (%8971:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, 
quant_to_type: UInt8, scale_type: Float32), uuid=536, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44, solved=0), )] (%8235:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44)], %8969:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=536)]) -> (%8972:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16, solved=0), )] (%8971:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)]) -> (%8973:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44, solved=0), )] (%8972:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, 
scale_type: Float32), uuid=44)]) -> (%8974:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=537, solved=0), )] (%8958:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=529)], %8973:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=16)]) -> (%8975:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=537)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=537, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=538, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=537, solved=0), )] (%8975:tensor<[1, 16, 32, 1024], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=537)], %8976:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=538), constant:[0.088388346]]) -> (%8977:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=537)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=537, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=539, solved=0), )] (%8977:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=537)]) -> (%8978:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=539)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=539, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=540, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=539, solved=0), )] (%8978:tensor<[1, 16, 
32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=539)], %8979:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=540), constant:[-20]]) -> (%8980:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=539)]) + linalg.CPU.EqualOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=541, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=542, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %8981:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=541), constant:[0]]) -> (%8982:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=542)]) + linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=542, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=537, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=539, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=539, solved=0), )] (%8982:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=542)], %8977:tensor<[1, 16, 32, 1024], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=537)], %8980:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=539)]) -> (%8983:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=539)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=539, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=543, solved=0), )] (%8983:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=539)]) -> (%8984:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=543)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=543, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=544, solved=0), )] (%8984:tensor<[1, 16, 32, 1024], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=543)], %8974:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=44)]) -> (%8985:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=544)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=544, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=544, solved=0), )] (%8985:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=544)]) -> (%8986:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=544)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=544, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=544, solved=0), )] (%8986:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=544)]) -> 
(%8986:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=544)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=544, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=546, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=545, solved=0))] (%8986:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=544)]) -> (%8987:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=546)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=546, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8944:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8987:tensor<[1, 32, 
2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=546)]) -> (%8988:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=547, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=548, solved=0))] (%8988:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8989:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=547)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=547, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=550, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=549, solved=0))] (%8989:tensor<[1, 32, 2048], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=547)]) -> (%8990:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=550)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=547, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=552, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=551, solved=0))] (%8989:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=547)]) -> (%8991:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=552)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=552, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=553, solved=0), )] (%8991:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=552)]) -> (%8992:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=553)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=552, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=553, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=552, solved=0), )] (%8991:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=552)], %8992:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=553)]) -> (%8993:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=552)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=552, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=550, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=552, solved=0), )] (%8993:tensor<[1, 32, 6144], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=552)], %8990:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=550)]) -> (%8994:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=552)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=552, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=555, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=554, solved=0))] (%8994:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=552)]) -> (%8995:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=555)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=555, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8988:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %8995:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=555)]) -> (%8996:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=556, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=557, solved=0))] (%8996:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%8997:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=556)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=556, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=559, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=558, solved=0))] (%8997:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=556)]) -> (%8998:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=559)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=556, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=561, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=560, solved=0))] (%8997:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=556)]) -> (%8999:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=561)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=556, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=563, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=562, solved=0))] (%8997:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=556)]) -> (%9000:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=563)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=559, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=559, solved=0), )] (%8998:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=559)]) -> (%8998:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=559)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=559, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=559, solved=0), )] (%8998:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=559)]) -> (%9001:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=559)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=561, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=561, solved=0), )] (%8999:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=561)]) -> (%8999:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=561)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=561, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=561, solved=0), )] (%8999:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=561)]) -> (%9002:tensor<[1, 8, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=561)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=563, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=563, solved=0), )] (%9000:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=563)]) -> (%9000:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=563)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=563, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=563, solved=0), )] (%9000:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=563)]) -> (%9003:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=563)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=559, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=565, solved=0))] (%9001:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=559)]) -> (%9004:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=561, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=567, solved=0))] (%9002:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=561)]) -> (%9005:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), )] (%9004:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)]) -> (%9004:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), )] (%9004:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)]) -> (%9004:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), )] (%9004:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)]) -> (%9006:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=564, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), )] (%9006:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)], %9004:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)]) -> (%9007:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), )] (%9007:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9008:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), )] (%9004:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9009:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), )] (%9009:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=564)], %9008:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)]) -> (%9010:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), )] (%9005:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) -> (%9005:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), )] (%9005:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) -> (%9005:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), )] (%9005:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) -> (%9011:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), )] (%9011:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)], %9005:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) -> (%9012:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 
0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), )] (%9012:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9013:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), )] (%9005:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)], 
%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9014:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), )] (%9014:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)], %9013:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) -> (%9015:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=568, solved=0), )] (%9015:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=566)]) -> (%9016:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=568)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=568, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569, solved=0), )] (%9016:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=568)]) -> (%9017:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569, solved=0), )] (%9017:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569)]) -> (%9019:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=563, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=570, solved=0), )] (%9003:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=563)]) -> (%9020:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=570)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=570, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=571, solved=0), )] (%9020:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=570)]) -> (%9021:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=571)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17, solved=0), )] (%8236:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)], %9019:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569)]) -> (%9023:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 
255, quant_to_type: UInt8, scale_type: Float32), uuid=45, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=571, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45, solved=0), )] (%8237:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45)], %9021:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=571)]) -> (%9024:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17, solved=0), )] (%9023:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)]) -> (%9025:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45, solved=0), )] 
(%9024:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45)]) -> (%9026:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=572, solved=0), )] (%9010:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=564)], %9025:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=17)]) -> (%9027:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=572)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=572, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=573, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=572, solved=0), )] (%9027:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=572)], %9028:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=573), constant:[0.088388346]]) -> (%9029:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=572)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=572, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=574, solved=0), )] (%9029:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=572)]) -> (%9030:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=574)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=574, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=575, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=574, solved=0), )] (%9030:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=574)], %9031:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=575), constant:[-20]]) -> (%9032:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=574)]) + linalg.CPU.EqualOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=576, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=577, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9033:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=576), constant:[0]]) -> (%9034:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=577)]) + linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=577, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=572, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=574, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=574, solved=0), )] (%9034:tensor<[1, 1, 32, 1024], UInt8, 
CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=577)], %9029:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=572)], %9032:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=574)]) -> (%9035:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=574)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=574, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=578, solved=0), )] (%9035:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=574)]) -> (%9036:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=578)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=578, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=579, solved=0), )] (%9036:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=578)], %9026:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=45)]) -> (%9037:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=579)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=579, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=579, solved=0), )] (%9037:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=579)]) -> (%9038:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=579)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=579, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=579, solved=0), )] (%9038:tensor<[1, 32, 16, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=579)]) -> (%9038:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=579)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=579, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=581, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=580, solved=0))] (%9038:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=579)]) -> (%9039:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=581)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=581, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%8996:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9039:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=581)]) -> (%9040:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=582, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=583, solved=0))] (%9040:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9041:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=582)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=582, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=585, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: 
-8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=584, solved=0))] (%9041:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=582)]) -> (%9042:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=585)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=582, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=586, solved=0))] (%9041:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=582)]) -> (%9043:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=588, solved=0), )] (%9043:tensor<[1, 32, 
6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587)]) -> (%9044:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=588)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=588, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587, solved=0), )] (%9043:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587)], %9044:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=588)]) -> (%9045:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=585, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587, solved=0), )] (%9045:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587)], %9042:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=585)]) -> (%9046:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=590, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=589, solved=0))] (%9046:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=587)]) -> (%9047:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=590)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=590, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9040:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9047:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=590)]) -> (%9048:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=591, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=592, solved=0))] (%9048:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9049:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=591)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=591, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=594, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=593, solved=0))] (%9049:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=591)]) -> (%9050:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=594)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=591, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=596, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=595, solved=0))] (%9049:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=591)]) -> (%9051:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=596)]) + linalg.CPU.LinearOp 
[qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=591, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=598, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=597, solved=0))] (%9049:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=591)]) -> (%9052:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=598)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=594, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=594, solved=0), )] (%9050:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=594)]) -> (%9050:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=594)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=594, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=594, solved=0), )] (%9050:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=594)]) -> (%9053:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=594)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=596, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=596, solved=0), )] (%9051:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=596)]) -> (%9051:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=596)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=596, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=596, solved=0), )] (%9051:tensor<[1, 32, 8, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=596)]) -> (%9054:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=596)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=598, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=598, solved=0), )] (%9052:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=598)]) -> (%9052:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=598)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=598, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=598, solved=0), )] (%9052:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=598)]) -> (%9055:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=598)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=594, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=600, solved=0))] (%9053:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=594)]) -> (%9056:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=596, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=602, solved=0))] (%9054:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=596)]) -> (%9057:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=601)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), )] (%9056:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)]) -> (%9056:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), )] (%9056:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)]) -> (%9056:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), )] (%9056:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)]) -> (%9058:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)]) + linalg.CPU.ConcatOp 
[qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), )] (%9058:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)], %9056:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)]) -> (%9059:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), )] (%9059:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9060:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), )] (%9056:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9061:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=599, solved=0), )] (%9061:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)], %9060:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)]) -> (%9062:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), )] (%9057:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)]) -> (%9057:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), )] (%9057:tensor<[1, 8, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)]) -> (%9057:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), )] (%9057:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)]) -> (%9063:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), )] (%9063:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)], %9057:tensor<[1, 8, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)]) -> (%9064:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), )] (%9064:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9065:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), )] (%9057:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9066:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), )] (%9066:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)], %9065:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)]) -> (%9067:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=603, solved=0), )] (%9067:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=601)]) -> (%9068:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=603)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=603, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604, solved=0), )] (%9068:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=603)]) -> (%9069:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604, solved=0), )] (%9069:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604)]) -> (%9071:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=598, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=605, solved=0), )] (%9055:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=598)]) -> (%9072:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=605)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=605, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=606, solved=0), )] (%9072:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=605)]) -> (%9073:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=606)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18, solved=0), )] (%8238:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)], %9071:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604)]) -> (%9075:tensor<[1, 8, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=606, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46, solved=0), )] (%8239:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46)], %9073:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=606)]) -> (%9076:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18, solved=0), )] (%9075:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)]) -> (%9077:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46, solved=0), )] (%9076:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46)]) -> (%9078:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=607, solved=0), )] (%9062:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=599)], %9077:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=18)]) -> (%9079:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=607)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=607, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=608, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=607, solved=0), )] (%9079:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=607)], %9080:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=608), constant:[0.088388346]]) -> (%9081:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=607)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=607, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=609, solved=0), )] (%9081:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=607)]) -> (%9082:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=609)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=609, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=610, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=609, solved=0), )] (%9082:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=609)], %9083:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=610), constant:[-20]]) -> (%9084:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=609)]) + linalg.CPU.EqualOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=611, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=612, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9085:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=611), constant:[0]]) -> (%9086:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=612)]) + linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=612, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=607, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=609, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=609, solved=0), )] (%9086:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=612)], %9081:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=607)], %9084:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=609)]) -> (%9087:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=609)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=609, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=613, solved=0), )] (%9087:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=609)]) -> (%9088:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=613)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=613, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=614, solved=0), )] (%9088:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=613)], %9078:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=46)]) -> (%9089:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=614)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=614, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=614, solved=0), )] (%9089:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=614)]) -> (%9090:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=614)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=614, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=614, solved=0), )] (%9090:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=614)]) -> (%9090:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=614)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=614, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=616, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=615, solved=0))] (%9090:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=614)]) -> (%9091:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=616)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=616, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9048:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9091:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=616)]) -> (%9092:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=617, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=618, solved=0))] (%9092:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9093:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=617)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=617, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=620, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=619, solved=0))] (%9093:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=617)]) -> (%9094:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=620)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=617, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=622, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=621, solved=0))] (%9093:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=617)]) -> (%9095:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=622)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=622, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=623, solved=0), )] (%9095:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=622)]) -> (%9096:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=623)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=622, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=623, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=622, solved=0), )] (%9095:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=622)], %9096:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=623)]) -> (%9097:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=622)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=622, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=620, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=622, solved=0), )] (%9097:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=622)], %9094:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=620)]) -> (%9098:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=622)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=622, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=625, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=624, solved=0))] (%9098:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=622)]) -> (%9099:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=625)]) + linalg.CPU.AddOp 
[qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=625, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9092:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9099:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=625)]) -> (%9100:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=626, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=627, solved=0))] (%9100:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9101:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=626)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=626, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=629, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=628, solved=0))] (%9101:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=626)]) -> (%9102:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=629)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=626, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=631, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=630, solved=0))] (%9101:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=626)]) -> (%9103:tensor<[1, 32, 1024], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=631)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=626, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=633, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=632, solved=0))] (%9101:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=626)]) -> (%9104:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=633)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=629, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=629, solved=0), )] (%9102:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=629)]) -> (%9102:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=629)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=629, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=629, solved=0), )] (%9102:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=629)]) -> (%9105:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=629)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=631, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=631, solved=0), )] (%9103:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=631)]) -> (%9103:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=631)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=631, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=631, solved=0), )] (%9103:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=631)]) -> (%9106:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=631)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=633, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=633, solved=0), )] (%9104:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=633)]) -> (%9104:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=633)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=633, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=633, solved=0), )] (%9104:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=633)]) -> (%9107:tensor<[1, 8, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=633)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=629, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=635, solved=0))] (%9105:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=629)]) -> (%9108:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=631, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=637, solved=0))] (%9106:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=631)]) -> (%9109:tensor<[1, 8, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=634, solved=0), )] (%9108:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)]) -> (%9108:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), )] (%9108:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)]) -> (%9108:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), )] (%9108:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)]) -> (%9110:tensor<[1, 16, 32, 64], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), )] (%9110:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)], %9108:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)]) -> (%9111:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), )] (%9111:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9112:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), )] (%9108:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9113:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=634, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), )] (%9113:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)], %9112:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)]) -> (%9114:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), )] (%9109:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) -> (%9109:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), )] (%9109:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) -> (%9109:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), )] (%9109:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) -> (%9115:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), )] (%9115:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)], %9109:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) -> (%9116:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), )] (%9116:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9117:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), 
uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), )] (%9109:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9118:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), )] (%9118:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)], %9117:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) -> (%9119:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) + linalg.CPU.CastTypeOp 
[qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=638, solved=0), )] (%9119:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=636)]) -> (%9120:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=638)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=638, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639, solved=0), )] (%9120:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=638)]) -> (%9121:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639, solved=0), )] (%9121:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639)]) -> (%9123:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=633, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=640, solved=0), )] (%9107:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=633)]) -> (%9124:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=640)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=640, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=641, solved=0), )] (%9124:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=640)]) -> (%9125:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=641)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19, solved=0), )] (%8240:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)], %9123:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 
0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639)]) -> (%9127:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=641, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47, solved=0), )] (%8241:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47)], %9125:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=641)]) -> (%9128:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19, solved=0), )] (%9127:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)]) -> (%9129:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=19)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47, solved=0), )] (%9128:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47)]) -> (%9130:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=642, solved=0), )] (%9114:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=634)], %9129:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=19)]) -> (%9131:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=642)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=642, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=643, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=642, solved=0), )] (%9131:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=642)], %9132:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=643), constant:[0.088388346]]) -> (%9133:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=642)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=642, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=644, solved=0), )] (%9133:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=642)]) -> (%9134:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=644)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=644, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=645, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=644, solved=0), )] (%9134:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=644)], %9135:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=645), constant:[-20]]) -> (%9136:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=644)]) + linalg.CPU.EqualOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=646, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=647, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9137:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=646), constant:[0]]) -> (%9138:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=647)]) + linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=647, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=642, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=644, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=644, solved=0), )] (%9138:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=647)], %9133:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=642)], %9136:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=644)]) -> (%9139:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=644)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=644, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=648, solved=0), )] (%9139:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=644)]) -> (%9140:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=648)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=648, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=649, solved=0), )] (%9140:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=648)], %9130:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=47)]) -> (%9141:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=649)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=649, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=649, solved=0), )] (%9141:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=649)]) -> (%9142:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=649)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=649, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=649, solved=0), )] (%9142:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=649)]) -> (%9142:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=649)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=649, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=651, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=650, solved=0))] (%9142:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=649)]) -> (%9143:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=651)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=651, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9100:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9143:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=651)]) -> (%9144:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=652, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=653, solved=0))] (%9144:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9145:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=652)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=652, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=655, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=654, solved=0))] (%9145:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=652)]) -> (%9146:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=655)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=652, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=656, solved=0))] (%9145:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=652)]) -> (%9147:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=658, solved=0), )] (%9147:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657)]) -> (%9148:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=658)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=658, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657, solved=0), )] (%9147:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657)], %9148:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=658)]) -> (%9149:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657)]) + linalg.CPU.MulOp 
[qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=655, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657, solved=0), )] (%9149:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657)], %9146:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=655)]) -> (%9150:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=660, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=659, solved=0))] (%9150:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=657)]) -> (%9151:tensor<[1, 32, 2048], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=660)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=660, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9144:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9151:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=660)]) -> (%9152:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=661, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=662, solved=0))] (%9152:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9153:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=661)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=661, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=664, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=663, solved=0))] (%9153:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=661)]) -> (%9154:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=664)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=661, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=666, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=665, solved=0))] (%9153:tensor<[1, 32, 2048], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=661)]) -> (%9155:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=666)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=661, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=668, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=667, solved=0))] (%9153:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=661)]) -> (%9156:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=668)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=664, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=664, solved=0), )] (%9154:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=664)]) -> (%9154:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=664)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=664, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=664, solved=0), )] (%9154:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=664)]) -> (%9157:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=664)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=666, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=666, solved=0), )] (%9155:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=666)]) -> (%9155:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=666)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=666, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=666, solved=0), )] (%9155:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=666)]) -> (%9158:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=666)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=668, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=668, solved=0), )] (%9156:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=668)]) -> (%9156:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=668)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=668, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=668, 
solved=0), )] (%9156:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=668)]) -> (%9159:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=668)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=664, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=670, solved=0))] (%9157:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=664)]) -> (%9160:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=666, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=672, solved=0))] 
(%9158:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=666)]) -> (%9161:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), )] (%9160:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)]) -> (%9160:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), )] (%9160:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)]) -> (%9160:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, 
solved=0), )] (%9160:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)]) -> (%9162:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), )] (%9162:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)], %9160:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)]) -> (%9163:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), )] (%9163:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9164:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), )] (%9160:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9165:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), )] (%9165:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)], %9164:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)]) -> (%9166:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), )] (%9161:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) -> (%9161:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), )] (%9161:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) -> (%9161:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), )] (%9161:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) -> (%9167:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=671, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), )] (%9167:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)], %9161:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) -> (%9168:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), )] (%9168:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9169:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), )] (%9161:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9170:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), )] (%9170:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)], %9169:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) -> (%9171:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=673, solved=0), )] (%9171:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=671)]) -> (%9172:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=673)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=673, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674, solved=0), )] (%9172:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=673)]) -> (%9173:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674, solved=0), )] (%9173:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, 
scale_type: Float32), uuid=674)]) -> (%9175:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=668, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=675, solved=0), )] (%9159:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=668)]) -> (%9176:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=675)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=675, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=676, solved=0), )] (%9176:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=675)]) -> (%9177:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=676)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20, solved=0), )] (%8242:tensor<[1, 8, 128, 992], 
Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)], %9175:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674)]) -> (%9179:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=676, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48, solved=0), )] (%8243:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48)], %9177:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=676)]) -> (%9180:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20, solved=0), )] (%9179:tensor<[1, 8, 128, 
1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)]) -> (%9181:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48, solved=0), )] (%9180:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48)]) -> (%9182:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=677, solved=0), )] (%9166:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=669)], %9181:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=20)]) -> (%9183:tensor<[1, 
16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=677)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=677, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=678, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=677, solved=0), )] (%9183:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=677)], %9184:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=678), constant:[0.088388346]]) -> (%9185:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=677)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=677, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=679, solved=0), )] (%9185:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=677)]) -> (%9186:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=679)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=679, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=680, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=679, solved=0), )] (%9186:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=679)], %9187:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=680), constant:[-20]]) -> (%9188:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=679)]) + linalg.CPU.EqualOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=681, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=682, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9189:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=681), constant:[0]]) -> (%9190:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=682)]) 
+ linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=682, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=677, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=679, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=679, solved=0), )] (%9190:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=682)], %9185:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=677)], %9188:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=679)]) -> (%9191:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=679)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=679, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=683, solved=0), )] (%9191:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=679)]) -> (%9192:tensor<[1, 16, 
32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=683)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=683, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=684, solved=0), )] (%9192:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=683)], %9182:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=48)]) -> (%9193:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=684)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=684, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=684, solved=0), )] (%9193:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=684)]) -> (%9194:tensor<[1, 32, 16, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=684)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=684, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=684, solved=0), )] (%9194:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=684)]) -> (%9194:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=684)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=684, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=686, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=685, solved=0))] (%9194:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=684)]) -> (%9195:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=686)]) + 
linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=686, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9152:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9195:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=686)]) -> (%9196:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=687, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=688, solved=0))] (%9196:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9197:tensor<[1, 32, 2048], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=687)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=687, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=690, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=689, solved=0))] (%9197:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=687)]) -> (%9198:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=690)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=687, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=691, solved=0))] (%9197:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=687)]) -> 
(%9199:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=693, solved=0), )] (%9199:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692)]) -> (%9200:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=693)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=693, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692, solved=0), )] (%9199:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692)], %9200:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=693)]) -> (%9201:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=690, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692, solved=0), )] (%9201:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692)], %9198:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=690)]) -> (%9202:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=695, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=694, solved=0))] 
(%9202:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=692)]) -> (%9203:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=695)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=695, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9196:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9203:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=695)]) -> (%9204:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=696, solved=0), 
weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=697, solved=0))] (%9204:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9205:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=696)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=696, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=699, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=698, solved=0))] (%9205:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=696)]) -> (%9206:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=699)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=696, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=701, solved=0), 
weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=700, solved=0))] (%9205:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=696)]) -> (%9207:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=701)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=696, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=703, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=702, solved=0))] (%9205:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=696)]) -> (%9208:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=703)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=699, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=699, 
solved=0), )] (%9206:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=699)]) -> (%9206:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=699)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=699, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=699, solved=0), )] (%9206:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=699)]) -> (%9209:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=699)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=701, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=701, solved=0), )] (%9207:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=701)]) -> (%9207:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=701)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=701, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=701, solved=0), )] (%9207:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=701)]) -> (%9210:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=701)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=703, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=703, solved=0), )] (%9208:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=703)]) -> (%9208:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=703)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=703, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=703, solved=0), )] (%9208:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=703)]) -> (%9211:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=703)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=699, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=705, solved=0))] (%9209:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=699)]) -> (%9212:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=701, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, 
solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=707, solved=0))] (%9210:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=701)]) -> (%9213:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), )] (%9212:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)]) -> (%9212:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), )] (%9212:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)]) -> (%9212:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), )] (%9212:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)]) -> (%9214:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), )] (%9214:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)], %9212:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)]) -> (%9215:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=704, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), )] (%9215:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9216:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), )] (%9212:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9217:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=704)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), )] (%9217:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)], %9216:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)]) -> (%9218:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), )] (%9213:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)]) -> (%9213:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=706)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), )] (%9213:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)]) -> (%9213:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), )] (%9213:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)]) -> (%9219:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), 
inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), )] (%9219:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)], %9213:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)]) -> (%9220:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), )] (%9220:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9221:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=706)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), )] (%9213:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9222:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), )] (%9222:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)], %9221:tensor<[1, 8, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)]) -> (%9223:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=708, solved=0), )] (%9223:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=706)]) -> (%9224:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=708)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=708, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709, solved=0), )] (%9224:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=708)]) -> (%9225:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709, solved=0), )] (%9225:tensor<[1, 8, 32, 128], 
UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709)]) -> (%9227:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=703, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=710, solved=0), )] (%9211:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=703)]) -> (%9228:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=710)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=710, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=711, solved=0), )] (%9228:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=710)]) -> (%9229:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=711)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, 
quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21, solved=0), )] (%8244:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)], %9227:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=709)]) -> (%9231:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=711, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49, solved=0), )] (%8245:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49)], %9229:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=711)]) -> (%9232:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 
-128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21, solved=0), )] (%9231:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)]) -> (%9233:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49, solved=0), )] (%9232:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49)]) -> (%9234:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=712, solved=0), )] (%9218:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=704)], %9233:tensor<[1, 16, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=21)]) -> (%9235:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=712)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=712, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=713, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=712, solved=0), )] (%9235:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=712)], %9236:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=713), constant:[0.088388346]]) -> (%9237:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=712)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=712, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=714, solved=0), )] (%9237:tensor<[1, 16, 32, 1024], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=712)]) -> (%9238:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=714)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=714, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=715, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=714, solved=0), )] (%9238:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=714)], %9239:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=715), constant:[-20]]) -> (%9240:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=714)]) + linalg.CPU.EqualOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=716, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=717, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9241:tensor<[1], UInt16, 
CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=716), constant:[0]]) -> (%9242:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=717)]) + linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=717, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=712, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=714, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=714, solved=0), )] (%9242:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=717)], %9237:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=712)], %9240:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=714)]) -> (%9243:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=714)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=714, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=718, solved=0), )] (%9243:tensor<[1, 16, 32, 1024], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=714)]) -> (%9244:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=718)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=718, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=719, solved=0), )] (%9244:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=718)], %9234:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=49)]) -> (%9245:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=719)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=719, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=719, solved=0), )] (%9245:tensor<[1, 16, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=719)]) -> (%9246:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=719)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=719, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=719, solved=0), )] (%9246:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=719)]) -> (%9246:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=719)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=719, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=721, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=720, solved=0))] (%9246:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=719)]) -> 
(%9247:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=721)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=721, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9204:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9247:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=721)]) -> (%9248:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=722, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=723, solved=0))] (%9248:tensor<[1, 32, 2048], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9249:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=722)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=722, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=725, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=724, solved=0))] (%9249:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=722)]) -> (%9250:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=725)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=722, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=726, solved=0))] (%9249:tensor<[1, 
32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=722)]) -> (%9251:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=728, solved=0), )] (%9251:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727)]) -> (%9252:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=728)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=728, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727, solved=0), )] (%9251:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727)], 
%9252:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=728)]) -> (%9253:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=725, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727, solved=0), )] (%9253:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727)], %9250:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=725)]) -> (%9254:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=730, 
solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=729, solved=0))] (%9254:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=727)]) -> (%9255:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=730)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=730, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9248:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9255:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=730)]) -> (%9256:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=731, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=732, solved=0))] (%9256:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9257:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=731)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=731, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=734, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=733, solved=0))] (%9257:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=731)]) -> (%9258:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=734)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=731, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=736, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=735, solved=0))] (%9257:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=731)]) -> (%9259:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=736)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=731, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=738, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=737, solved=0))] (%9257:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=731)]) -> (%9260:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=738)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=734, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=734, solved=0), )] (%9258:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=734)]) -> (%9258:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=734)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=734, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=734, solved=0), )] (%9258:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=734)]) -> (%9261:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=734)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=736, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=736, solved=0), )] (%9259:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=736)]) -> (%9259:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=736)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=736, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=736, solved=0), )] (%9259:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=736)]) -> (%9262:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=736)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=738, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=738, solved=0), )] (%9260:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=738)]) -> (%9260:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=738)]) + linalg.CPU.TransposeOp 
[qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=738, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=738, solved=0), )] (%9260:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=738)]) -> (%9263:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=738)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=734, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=740, solved=0))] (%9261:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=734)]) -> (%9264:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=736, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=742, solved=0))] (%9262:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=736)]) -> (%9265:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), )] (%9264:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)]) -> (%9264:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), )] (%9264:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)]) -> (%9264:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), )] (%9264:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)]) -> (%9266:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), )] (%9266:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)], %9264:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)]) -> (%9267:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), )] (%9267:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9268:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), )] (%9264:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32), uuid=65)]) -> (%9269:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), )] (%9269:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)], %9268:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)]) -> (%9270:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), )] (%9265:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=741)]) -> (%9265:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), )] (%9265:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)]) -> (%9265:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), )] (%9265:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)]) -> (%9271:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), )] (%9271:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)], %9265:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)]) -> (%9272:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), )] (%9272:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32), uuid=63)]) -> (%9273:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), )] (%9265:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9274:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), )] (%9274:tensor<[1, 8, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)], %9273:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)]) -> (%9275:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=743, solved=0), )] (%9275:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=741)]) -> (%9276:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=743)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=743, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744, solved=0), )] (%9276:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=743)]) -> (%9277:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: 
Float32), uuid=744, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744, solved=0), )] (%9277:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744)]) -> (%9279:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=738, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=745, solved=0), )] (%9263:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=738)]) -> (%9280:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=745)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=745, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=746, solved=0), )] (%9280:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=745)]) -> (%9281:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=746)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22, 
solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22, solved=0), )] (%8246:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)], %9279:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744)]) -> (%9283:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=746, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50, solved=0), )] (%8247:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50)], %9281:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=746)]) -> (%9284:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22, solved=0), )] (%9283:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)]) -> (%9285:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50, solved=0), )] (%9284:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50)]) -> (%9286:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=747, solved=0), )] (%9270:tensor<[1, 16, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=739)], %9285:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=22)]) -> (%9287:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=747)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=747, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=748, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=747, solved=0), )] (%9287:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=747)], %9288:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=748), constant:[0.088388346]]) -> (%9289:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=747)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=747, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=749, solved=0), )] (%9289:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=747)]) -> (%9290:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=749)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=749, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=750, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=749, solved=0), )] (%9290:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=749)], %9291:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=750), constant:[-20]]) -> (%9292:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=749)]) + linalg.CPU.EqualOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=751, solved=0), 
outputs_0:QuantSpec(Raw(type: UInt8), uuid=752, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9293:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=751), constant:[0]]) -> (%9294:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=752)]) + linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=752, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=747, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=749, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=749, solved=0), )] (%9294:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=752)], %9289:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=747)], %9292:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=749)]) -> (%9295:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=749)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=749, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=753, solved=0), )] (%9295:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=749)]) -> (%9296:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=753)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=753, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=754, solved=0), )] (%9296:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=753)], %9286:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=50)]) -> (%9297:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=754)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=754, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=754, solved=0), )] (%9297:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=754)]) -> (%9298:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=754)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=754, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=754, solved=0), )] (%9298:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=754)]) -> (%9298:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=754)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=754, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=756, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=755, solved=0))] 
(%9298:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=754)]) -> (%9299:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=756)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=756, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9256:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9299:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=756)]) -> (%9300:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=757, solved=0), 
weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=758, solved=0))] (%9300:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9301:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=757)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=757, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=760, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=759, solved=0))] (%9301:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=757)]) -> (%9302:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=760)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=757, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762, solved=0), 
weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=761, solved=0))] (%9301:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=757)]) -> (%9303:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=763, solved=0), )] (%9303:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762)]) -> (%9304:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=763)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=763, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762, solved=0), )] 
(%9303:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762)], %9304:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=763)]) -> (%9305:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=760, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762, solved=0), )] (%9305:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762)], %9302:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=760)]) -> (%9306:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=762, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=765, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=764, solved=0))] (%9306:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=762)]) -> (%9307:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=765)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=765, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9300:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9307:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=765)]) -> (%9308:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp 
[qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=766, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=767, solved=0))] (%9308:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9309:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=766)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=766, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=769, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=768, solved=0))] (%9309:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=766)]) -> (%9310:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=769)]) + linalg.CPU.LinearOp 
[qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=766, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=771, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=770, solved=0))] (%9309:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=766)]) -> (%9311:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=771)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=766, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=773, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=772, solved=0))] (%9309:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=766)]) -> (%9312:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=773)]) + 
linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=769, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=769, solved=0), )] (%9310:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=769)]) -> (%9310:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=769)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=769, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=769, solved=0), )] (%9310:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=769)]) -> (%9313:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=769)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=771, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=771, solved=0), )] (%9311:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=771)]) -> (%9311:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=771)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=771, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=771, solved=0), )] (%9311:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=771)]) -> (%9314:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=771)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=773, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=773, solved=0), )] (%9312:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=773)]) -> (%9312:tensor<[1, 32, 8, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=773)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=773, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=773, solved=0), )] (%9312:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=773)]) -> (%9315:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=773)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=769, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=775, solved=0))] (%9313:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=769)]) -> (%9316:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)]) + 
linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=771, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=777, solved=0))] (%9314:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=771)]) -> (%9317:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), )] (%9316:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)]) -> (%9316:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), )] (%9316:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)]) -> (%9316:tensor<[1, 16, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), )] (%9316:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)]) -> (%9318:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), )] (%9318:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)], %9316:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)]) -> (%9319:tensor<[1, 16, 32, 
128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), )] (%9319:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9320:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), )] (%9316:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9321:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), )] (%9321:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)], %9320:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)]) -> (%9322:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=776, solved=0), )] (%9317:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) -> (%9317:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), )] (%9317:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) -> (%9317:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), )] (%9317:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) -> (%9323:tensor<[1, 8, 32, 64], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), )] (%9323:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)], %9317:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) -> (%9324:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), )] (%9324:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9325:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), )] (%9317:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9326:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=776, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), )] (%9326:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)], %9325:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) -> (%9327:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=778, solved=0), )] (%9327:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=776)]) -> (%9328:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=778)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=778, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779, solved=0), )] (%9328:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=778)]) -> (%9329:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), 
uuid=779)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779, solved=0), )] (%9329:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779)]) -> (%9331:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=773, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=780, solved=0), )] (%9315:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=773)]) -> (%9332:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=780)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=780, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=781, solved=0), )] (%9332:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=780)]) -> (%9333:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=781)]) + 
linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23, solved=0), )] (%8248:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)], %9331:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779)]) -> (%9335:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=781, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51, solved=0), )] (%8249:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51)], %9333:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=781)]) -> (%9336:tensor<[1, 8, 1024, 128], 
UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23, solved=0), )] (%9335:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)]) -> (%9337:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51, solved=0), )] (%9336:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51)]) -> (%9338:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=782, solved=0), )] (%9322:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=774)], %9337:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=23)]) -> (%9339:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=782)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=782, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=783, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=782, solved=0), )] (%9339:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=782)], %9340:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=783), constant:[0.088388346]]) -> (%9341:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=782)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=782, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=784, solved=0), )] (%9341:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=782)]) -> (%9342:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=784)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=784, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=785, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=784, solved=0), )] (%9342:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=784)], %9343:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=785), constant:[-20]]) -> (%9344:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=784)]) + linalg.CPU.EqualOp 
[qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=786, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=787, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9345:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=786), constant:[0]]) -> (%9346:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=787)]) + linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=787, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=782, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=784, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=784, solved=0), )] (%9346:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=787)], %9341:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=782)], %9344:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=784)]) -> (%9347:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=784)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=784, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=788, solved=0), )] (%9347:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=784)]) -> (%9348:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=788)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=788, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=789, solved=0), )] (%9348:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=788)], %9338:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=51)]) -> (%9349:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=789)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=789, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=789, solved=0), )] (%9349:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=789)]) -> (%9350:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=789)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=789, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=789, solved=0), )] (%9350:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=789)]) -> (%9350:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=789)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=789, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=791, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=790, solved=0))] (%9350:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=789)]) -> (%9351:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=791)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=791, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9308:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9351:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=791)]) -> (%9352:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=792, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=793, solved=0))] (%9352:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9353:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=792)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=792, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=795, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=794, solved=0))] (%9353:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=792)]) -> (%9354:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=795)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=792, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=796, solved=0))] (%9353:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=792)]) -> (%9355:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=798, solved=0), )] (%9355:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797)]) -> (%9356:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=798)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=798, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797, solved=0), )] (%9355:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797)], %9356:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=798)]) -> (%9357:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=795, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797, solved=0), )] (%9357:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797)], %9354:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=795)]) -> (%9358:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797)]) + 
linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=800, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=799, solved=0))] (%9358:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=797)]) -> (%9359:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=800)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=800, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9352:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9359:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=800)]) -> (%9360:tensor<[1, 
32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=801, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=802, solved=0))] (%9360:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9361:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=801)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=801, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=804, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=803, solved=0))] (%9361:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=801)]) -> (%9362:tensor<[1, 32, 2048], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=804)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=801, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=806, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=805, solved=0))] (%9361:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=801)]) -> (%9363:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=806)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=801, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=808, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=807, solved=0))] (%9361:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=801)]) -> 
(%9364:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=808)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=804, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=804, solved=0), )] (%9362:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=804)]) -> (%9362:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=804)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=804, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=804, solved=0), )] (%9362:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=804)]) -> (%9365:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=804)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=806, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=806, solved=0), )] (%9363:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=806)]) -> (%9363:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=806)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=806, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=806, solved=0), )] (%9363:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=806)]) -> (%9366:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=806)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=808, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=808, 
solved=0), )] (%9364:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=808)]) -> (%9364:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=808)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=808, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=808, solved=0), )] (%9364:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=808)]) -> (%9367:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=808)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=804, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=810, solved=0))] (%9365:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=804)]) -> (%9368:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=806, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=812, solved=0))] (%9366:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=806)]) -> (%9369:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp 
[qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), )] (%9368:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)]) -> (%9368:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), )] (%9368:tensor<[1, 16, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)]) -> (%9368:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), )] (%9368:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)]) -> (%9370:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), )] (%9370:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)], 
%9368:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)]) -> (%9371:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), )] (%9371:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9372:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), )] (%9368:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9373:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), )] (%9373:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)], %9372:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)]) -> (%9374:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), )] (%9369:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) -> (%9369:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), )] (%9369:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) -> (%9369:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), 
)] (%9369:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) -> (%9375:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), )] (%9375:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)], %9369:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) -> (%9376:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), )] (%9376:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9377:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), )] (%9369:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9378:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), )] (%9378:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)], %9377:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) -> (%9379:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=813, solved=0), )] (%9379:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=811)]) -> (%9380:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=813)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=813, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, 
scale_type: Float32), uuid=814, solved=0), )] (%9380:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=813)]) -> (%9381:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814, solved=0), )] (%9381:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814)]) -> (%9383:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=808, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=815, solved=0), )] (%9367:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=808)]) -> (%9384:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=815)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=815, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=816, solved=0), 
)] (%9384:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=815)]) -> (%9385:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=816)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24, solved=0), )] (%8250:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)], %9383:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814)]) -> (%9387:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=816, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52, solved=0), )] (%8251:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, 
quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52)], %9385:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=816)]) -> (%9388:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24, solved=0), )] (%9387:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)]) -> (%9389:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52, solved=0), )] (%9388:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52)]) -> (%9390:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=817, solved=0), )] (%9374:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=809)], %9389:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=24)]) -> (%9391:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=817)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=817, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=818, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=817, solved=0), )] (%9391:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=817)], %9392:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=818), constant:[0.088388346]]) -> (%9393:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=817)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=817, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=819, solved=0), )] (%9393:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=817)]) -> (%9394:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=819)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=819, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=820, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=819, solved=0), )] (%9394:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=819)], %9395:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=820), constant:[-20]]) -> (%9396:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=819)]) + linalg.CPU.EqualOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=821, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=822, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9397:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=821), constant:[0]]) -> (%9398:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=822)]) + linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=822, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=817, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=819, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=819, solved=0), )] (%9398:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=822)], %9393:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=817)], %9396:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=819)]) -> (%9399:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=819)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=819, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=823, solved=0), )] (%9399:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=819)]) -> (%9400:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=823)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=823, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=824, solved=0), )] (%9400:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=823)], %9390:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=52)]) -> 
(%9401:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=824)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=824, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=824, solved=0), )] (%9401:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=824)]) -> (%9402:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=824)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=824, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=824, solved=0), )] (%9402:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=824)]) -> (%9402:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=824)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=824, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=826, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=825, solved=0))] (%9402:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=824)]) -> (%9403:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=826)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=826, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9360:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9403:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=826)]) -> (%9404:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=827, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=828, solved=0))] (%9404:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9405:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=827)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=827, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=830, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=829, solved=0))] (%9405:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=827)]) -> (%9406:tensor<[1, 32, 6144], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=830)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=827, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=831, solved=0))] (%9405:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=827)]) -> (%9407:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=833, solved=0), )] (%9407:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832)]) -> (%9408:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=833)]) + 
linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=833, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832, solved=0), )] (%9407:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832)], %9408:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=833)]) -> (%9409:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=830, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832, solved=0), )] (%9409:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832)], %9406:tensor<[1, 32, 6144], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=830)]) -> (%9410:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=835, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=834, solved=0))] (%9410:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=832)]) -> (%9411:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=835)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=835, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9404:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9411:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=835)]) -> (%9412:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=836, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=837, solved=0))] (%9412:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9413:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=836)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=836, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=839, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: 
-8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=838, solved=0))] (%9413:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=836)]) -> (%9414:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=839)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=836, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=841, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=840, solved=0))] (%9413:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=836)]) -> (%9415:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=841)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=836, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=843, solved=0), 
weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=842, solved=0))] (%9413:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=836)]) -> (%9416:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=843)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=839, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=839, solved=0), )] (%9414:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=839)]) -> (%9414:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=839)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=839, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=839, solved=0), )] (%9414:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=839)]) -> (%9417:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=839)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=841, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=841, solved=0), )] (%9415:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=841)]) -> (%9415:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=841)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=841, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=841, solved=0), )] (%9415:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=841)]) -> (%9418:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=841)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=843, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=843, solved=0), )] (%9416:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=843)]) -> (%9416:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=843)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=843, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=843, solved=0), )] (%9416:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=843)]) -> (%9419:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=843)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=839, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, 
solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=845, solved=0))] (%9417:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=839)]) -> (%9420:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=841, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=847, solved=0))] (%9418:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=841)]) -> (%9421:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), )] (%9420:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)]) -> (%9420:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), )] (%9420:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)]) -> (%9420:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), )] (%9420:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)]) -> (%9422:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), )] (%9422:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)], %9420:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)]) -> (%9423:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), )] (%9423:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9424:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), )] (%9420:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9425:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), )] (%9425:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)], %9424:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=844)]) -> (%9426:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), )] (%9421:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) -> (%9421:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), )] (%9421:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) -> (%9421:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), )] (%9421:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) -> (%9427:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), )] (%9427:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)], %9421:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) -> (%9428:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), )] (%9428:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9429:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), )] (%9421:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32), uuid=65)]) -> (%9430:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), )] (%9430:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)], %9429:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) -> (%9431:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=848, solved=0), )] (%9431:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=846)]) -> (%9432:tensor<[1, 8, 32, 128], Float16, 
CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=848)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=848, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849, solved=0), )] (%9432:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=848)]) -> (%9433:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849, solved=0), )] (%9433:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849)]) -> (%9435:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=843, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=850, solved=0), )] (%9419:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=843)]) -> (%9436:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: 
Float16), uuid=850)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=850, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=851, solved=0), )] (%9436:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=850)]) -> (%9437:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=851)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25, solved=0), )] (%8252:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)], %9435:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849)]) -> (%9439:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, 
quant_to_type: UInt8, scale_type: Float32), uuid=851, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53, solved=0), )] (%8253:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53)], %9437:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=851)]) -> (%9440:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25, solved=0), )] (%9439:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)]) -> (%9441:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53, solved=0), )] (%9440:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, 
scale_type: Float32), uuid=53)]) -> (%9442:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=852, solved=0), )] (%9426:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=844)], %9441:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=25)]) -> (%9443:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=852)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=852, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=853, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=852, solved=0), )] (%9443:tensor<[1, 16, 32, 1024], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=852)], %9444:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=853), constant:[0.088388346]]) -> (%9445:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=852)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=852, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=854, solved=0), )] (%9445:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=852)]) -> (%9446:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=854)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=854, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=855, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=854, solved=0), )] (%9446:tensor<[1, 16, 
32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=854)], %9447:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=855), constant:[-20]]) -> (%9448:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=854)]) + linalg.CPU.EqualOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=856, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=857, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9449:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=856), constant:[0]]) -> (%9450:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=857)]) + linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=857, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=852, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=854, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=854, solved=0), )] (%9450:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=857)], %9445:tensor<[1, 16, 32, 1024], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=852)], %9448:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=854)]) -> (%9451:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=854)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=854, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=858, solved=0), )] (%9451:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=854)]) -> (%9452:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=858)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=858, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=859, solved=0), )] (%9452:tensor<[1, 16, 32, 1024], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=858)], %9442:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=53)]) -> (%9453:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=859)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=859, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=859, solved=0), )] (%9453:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=859)]) -> (%9454:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=859)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=859, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=859, solved=0), )] (%9454:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=859)]) -> 
(%9454:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=859)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=859, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=861, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=860, solved=0))] (%9454:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=859)]) -> (%9455:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=861)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=861, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9412:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9455:tensor<[1, 32, 
2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=861)]) -> (%9456:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=862, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=863, solved=0))] (%9456:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9457:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=862)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=862, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=865, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=864, solved=0))] (%9457:tensor<[1, 32, 2048], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=862)]) -> (%9458:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=865)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=862, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=866, solved=0))] (%9457:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=862)]) -> (%9459:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=868, solved=0), )] (%9459:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=867)]) -> (%9460:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=868)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=868, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867, solved=0), )] (%9459:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867)], %9460:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=868)]) -> (%9461:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=865, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867, solved=0), )] (%9461:tensor<[1, 32, 6144], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867)], %9458:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=865)]) -> (%9462:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=870, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=869, solved=0))] (%9462:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=867)]) -> (%9463:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=870)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=870, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9456:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9463:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=870)]) -> (%9464:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=871, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=872, solved=0))] (%9464:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9465:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=871)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=871, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=874, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=873, solved=0))] (%9465:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=871)]) -> (%9466:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=874)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=871, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=876, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=875, solved=0))] (%9465:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=871)]) -> (%9467:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=876)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=871, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=878, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=877, solved=0))] (%9465:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=871)]) -> (%9468:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=878)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=874, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=874, solved=0), )] (%9466:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=874)]) -> (%9466:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=874)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=874, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=874, solved=0), )] (%9466:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=874)]) -> (%9469:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=874)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=876, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=876, solved=0), )] (%9467:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=876)]) -> (%9467:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=876)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=876, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=876, solved=0), )] (%9467:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=876)]) -> (%9470:tensor<[1, 8, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=876)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=878, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=878, solved=0), )] (%9468:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=878)]) -> (%9468:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=878)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=878, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=878, solved=0), )] (%9468:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=878)]) -> (%9471:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=878)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=874, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=880, solved=0))] (%9469:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=874)]) -> (%9472:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=876, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=882, solved=0))] (%9470:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=876)]) -> (%9473:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), )] (%9472:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)]) -> (%9472:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), )] (%9472:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)]) -> (%9472:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), )] (%9472:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)]) -> (%9474:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=879, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), )] (%9474:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)], %9472:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)]) -> (%9475:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), )] (%9475:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9476:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), )] (%9472:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9477:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), )] (%9477:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=879)], %9476:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)]) -> (%9478:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), )] (%9473:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)]) -> (%9473:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), )] (%9473:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)]) -> (%9473:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), )] (%9473:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)]) -> (%9479:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), )] (%9479:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)], %9473:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)]) -> (%9480:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 
0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), )] (%9480:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9481:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), )] (%9473:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)], 
%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9482:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), )] (%9482:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)], %9481:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)]) -> (%9483:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=883, solved=0), )] (%9483:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=881)]) -> (%9484:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=883)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=883, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884, solved=0), )] (%9484:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=883)]) -> (%9485:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884, solved=0), )] (%9485:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884)]) -> (%9487:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=878, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=885, solved=0), )] (%9471:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=878)]) -> (%9488:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=885)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=885, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=886, solved=0), )] (%9488:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=885)]) -> (%9489:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=886)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26, solved=0), )] (%8254:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)], %9487:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884)]) -> (%9491:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 
255, quant_to_type: UInt8, scale_type: Float32), uuid=54, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=886, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54, solved=0), )] (%8255:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54)], %9489:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=886)]) -> (%9492:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26, solved=0), )] (%9491:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)]) -> (%9493:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54, solved=0), )] 
(%9492:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54)]) -> (%9494:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=887, solved=0), )] (%9478:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=879)], %9493:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=26)]) -> (%9495:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=887)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=887, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=888, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=887, solved=0), )] (%9495:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=887)], %9496:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=888), constant:[0.088388346]]) -> (%9497:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=887)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=887, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=889, solved=0), )] (%9497:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=887)]) -> (%9498:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=889)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=889, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=890, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=889, solved=0), )] (%9498:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=889)], %9499:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=890), constant:[-20]]) -> (%9500:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=889)]) + linalg.CPU.EqualOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=891, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=892, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9501:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=891), constant:[0]]) -> (%9502:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=892)]) + linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=892, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=887, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=889, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=889, solved=0), )] (%9502:tensor<[1, 1, 32, 1024], UInt8, 
CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=892)], %9497:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=887)], %9500:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=889)]) -> (%9503:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=889)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=889, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=893, solved=0), )] (%9503:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=889)]) -> (%9504:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=893)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=893, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=894, solved=0), )] (%9504:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=893)], %9494:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=54)]) -> (%9505:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=894)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=894, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=894, solved=0), )] (%9505:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=894)]) -> (%9506:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=894)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=894, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=894, solved=0), )] (%9506:tensor<[1, 32, 16, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=894)]) -> (%9506:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=894)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=894, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=896, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=895, solved=0))] (%9506:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=894)]) -> (%9507:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=896)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=896, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9464:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9507:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=896)]) -> (%9508:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=897, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=898, solved=0))] (%9508:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9509:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=897)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=897, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=900, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: 
-8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=899, solved=0))] (%9509:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=897)]) -> (%9510:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=900)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=897, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=901, solved=0))] (%9509:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=897)]) -> (%9511:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=903, solved=0), )] (%9511:tensor<[1, 32, 
6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902)]) -> (%9512:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=903)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=903, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902, solved=0), )] (%9511:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902)], %9512:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=903)]) -> (%9513:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=900, solved=0), 
outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902, solved=0), )] (%9513:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902)], %9510:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=900)]) -> (%9514:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=905, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=904, solved=0))] (%9514:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=902)]) -> (%9515:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=905)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=905, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9508:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9515:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=905)]) -> (%9516:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=906, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=907, solved=0))] (%9516:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9517:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=906)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=906, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=909, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=908, solved=0))] (%9517:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=906)]) -> (%9518:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=909)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=906, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=911, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=910, solved=0))] (%9517:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=906)]) -> (%9519:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=911)]) + linalg.CPU.LinearOp 
[qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=906, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=913, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=912, solved=0))] (%9517:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=906)]) -> (%9520:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=913)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=909, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=909, solved=0), )] (%9518:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=909)]) -> (%9518:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=909)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=909, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=909, solved=0), )] (%9518:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=909)]) -> (%9521:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=909)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=911, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=911, solved=0), )] (%9519:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=911)]) -> (%9519:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=911)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=911, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=911, solved=0), )] (%9519:tensor<[1, 32, 8, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=911)]) -> (%9522:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=911)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=913, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=913, solved=0), )] (%9520:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=913)]) -> (%9520:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=913)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=913, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=913, solved=0), )] (%9520:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=913)]) -> (%9523:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=913)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=909, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=915, solved=0))] (%9521:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=909)]) -> (%9524:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=911, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=917, solved=0))] (%9522:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=911)]) -> (%9525:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=916)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), )] (%9524:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)]) -> (%9524:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), )] (%9524:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)]) -> (%9524:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), )] (%9524:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)]) -> (%9526:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)]) + linalg.CPU.ConcatOp 
[qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), )] (%9526:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)], %9524:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)]) -> (%9527:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), )] (%9527:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9528:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), )] (%9524:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9529:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=914, solved=0), )] (%9529:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)], %9528:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)]) -> (%9530:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), )] (%9525:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) -> (%9525:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), )] (%9525:tensor<[1, 8, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) -> (%9525:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), )] (%9525:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) -> (%9531:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), )] (%9531:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)], %9525:tensor<[1, 8, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) -> (%9532:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), )] (%9532:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9533:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), )] (%9525:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9534:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), )] (%9534:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)], %9533:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) -> (%9535:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=918, solved=0), )] (%9535:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=916)]) -> (%9536:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=918)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=918, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919, solved=0), )] (%9536:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=918)]) -> (%9537:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919, solved=0), )] (%9537:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919)]) -> (%9539:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=913, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=920, solved=0), )] (%9523:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=913)]) -> (%9540:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=920)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=920, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=921, solved=0), )] (%9540:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=920)]) -> (%9541:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=921)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27, solved=0), )] (%8256:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)], %9539:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919)]) -> (%9543:tensor<[1, 8, 128, 1024], Int8PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=921, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55, solved=0), )] (%8257:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55)], %9541:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=921)]) -> (%9544:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27, solved=0), )] (%9543:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)]) -> (%9545:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55, solved=0), )] (%9544:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55)]) -> (%9546:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=922, solved=0), )] (%9530:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=914)], %9545:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=27)]) -> (%9547:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=922)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=922, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=923, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=922, solved=0), )] (%9547:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=922)], %9548:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=923), constant:[0.088388346]]) -> (%9549:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=922)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=922, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=924, solved=0), )] (%9549:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=922)]) -> (%9550:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=924)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=924, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=925, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=924, solved=0), )] (%9550:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=924)], %9551:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=925), constant:[-20]]) -> (%9552:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=924)]) + linalg.CPU.EqualOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=926, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=927, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9553:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=926), constant:[0]]) -> (%9554:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=927)]) + linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=927, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=922, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=924, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=924, solved=0), )] (%9554:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=927)], %9549:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=922)], %9552:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=924)]) -> (%9555:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=924)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=924, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=928, solved=0), )] (%9555:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=924)]) -> (%9556:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=928)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=928, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=929, solved=0), )] (%9556:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=928)], %9546:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=55)]) -> (%9557:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=929)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=929, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=929, solved=0), )] (%9557:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=929)]) -> (%9558:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=929)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=929, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=929, solved=0), )] (%9558:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=929)]) -> (%9558:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=929)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=929, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=931, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=930, solved=0))] (%9558:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=929)]) -> (%9559:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=931)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=931, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9516:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9559:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=931)]) -> (%9560:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=932, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=933, solved=0))] (%9560:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9561:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=932)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=932, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=935, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=934, solved=0))] (%9561:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=932)]) -> (%9562:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=935)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=932, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=937, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=936, solved=0))] (%9561:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=932)]) -> (%9563:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=937)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=937, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=938, solved=0), )] (%9563:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=937)]) -> (%9564:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=938)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=937, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=938, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=937, solved=0), )] (%9563:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=937)], %9564:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=938)]) -> (%9565:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=937)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=937, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=935, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=937, solved=0), )] (%9565:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=937)], %9562:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=935)]) -> (%9566:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=937)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=937, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=940, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=939, solved=0))] (%9566:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=937)]) -> (%9567:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=940)]) + linalg.CPU.AddOp 
[qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=940, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9560:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9567:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=940)]) -> (%9568:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=941, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=942, solved=0))] (%9568:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9569:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=941)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=941, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=944, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=943, solved=0))] (%9569:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=941)]) -> (%9570:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=944)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=941, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=946, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=945, solved=0))] (%9569:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=941)]) -> (%9571:tensor<[1, 32, 1024], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=946)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=941, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=948, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=947, solved=0))] (%9569:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=941)]) -> (%9572:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=948)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=944, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=944, solved=0), )] (%9570:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=944)]) -> (%9570:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=944)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=944, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=944, solved=0), )] (%9570:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=944)]) -> (%9573:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=944)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=946, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=946, solved=0), )] (%9571:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=946)]) -> (%9571:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=946)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=946, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=946, solved=0), )] (%9571:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=946)]) -> (%9574:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=946)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=948, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=948, solved=0), )] (%9572:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=948)]) -> (%9572:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=948)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=948, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=948, solved=0), )] (%9572:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=948)]) -> (%9575:tensor<[1, 8, 32, 128], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=948)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=944, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=950, solved=0))] (%9573:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=944)]) -> (%9576:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=946, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=952, solved=0))] (%9574:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=946)]) -> (%9577:tensor<[1, 8, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=949, solved=0), )] (%9576:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)]) -> (%9576:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), )] (%9576:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)]) -> (%9576:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), )] (%9576:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)]) -> (%9578:tensor<[1, 16, 32, 64], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), )] (%9578:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)], %9576:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)]) -> (%9579:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), )] (%9579:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, 
quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9580:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), )] (%9576:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9581:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, 
zero_point_type: Int32), uuid=949, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), )] (%9581:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)], %9580:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)]) -> (%9582:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), )] (%9577:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) -> (%9577:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), )] (%9577:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) -> (%9577:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), )] (%9577:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) -> (%9583:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), )] (%9583:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)], %9577:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) -> (%9584:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), )] (%9584:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9585:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), 
uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), )] (%9577:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9586:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), )] (%9586:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)], %9585:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) -> (%9587:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) + linalg.CPU.CastTypeOp 
[qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=953, solved=0), )] (%9587:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=951)]) -> (%9588:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=953)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=953, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954, solved=0), )] (%9588:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=953)]) -> (%9589:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954, solved=0), )] (%9589:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954)]) -> (%9591:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=948, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=955, solved=0), )] (%9575:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=948)]) -> (%9592:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=955)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=955, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=956, solved=0), )] (%9592:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=955)]) -> (%9593:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=956)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28, solved=0), )] (%8258:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)], %9591:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 
0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954)]) -> (%9595:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=956, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56, solved=0), )] (%8259:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56)], %9593:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=956)]) -> (%9596:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28, solved=0), )] (%9595:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)]) -> (%9597:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=28)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56, solved=0), )] (%9596:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56)]) -> (%9598:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=957, solved=0), )] (%9582:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=949)], %9597:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=28)]) -> (%9599:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=957)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=957, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=958, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=957, solved=0), )] (%9599:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=957)], %9600:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=958), constant:[0.088388346]]) -> (%9601:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=957)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=957, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=959, solved=0), )] (%9601:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=957)]) -> (%9602:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=959)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=959, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=960, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=959, solved=0), )] (%9602:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=959)], %9603:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=960), constant:[-20]]) -> (%9604:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=959)]) + linalg.CPU.EqualOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=961, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=962, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9605:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=961), constant:[0]]) -> (%9606:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=962)]) + linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=962, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=957, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=959, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=959, solved=0), )] (%9606:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=962)], %9601:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=957)], %9604:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=959)]) -> (%9607:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=959)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=959, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=963, solved=0), )] (%9607:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=959)]) -> (%9608:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=963)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=963, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=964, solved=0), )] (%9608:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=963)], %9598:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=56)]) -> (%9609:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=964)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=964, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=964, solved=0), )] (%9609:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=964)]) -> (%9610:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=964)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=964, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=964, solved=0), )] (%9610:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=964)]) -> (%9610:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=964)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=964, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=966, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=965, solved=0))] (%9610:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=964)]) -> (%9611:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=966)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=966, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9568:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9611:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=966)]) -> (%9612:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=967, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=968, solved=0))] (%9612:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9613:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=967)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=967, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=970, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=969, solved=0))] (%9613:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=967)]) -> (%9614:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=970)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=967, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=971, solved=0))] (%9613:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=967)]) -> (%9615:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=973, solved=0), )] (%9615:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972)]) -> (%9616:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=973)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=973, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972, solved=0), )] (%9615:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972)], %9616:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=973)]) -> (%9617:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972)]) + linalg.CPU.MulOp 
[qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=970, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972, solved=0), )] (%9617:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972)], %9614:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=970)]) -> (%9618:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=975, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=974, solved=0))] (%9618:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=972)]) -> (%9619:tensor<[1, 32, 2048], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=975)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=975, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9612:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9619:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=975)]) -> (%9620:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=976, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=977, solved=0))] (%9620:tensor<[1, 32, 2048], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9621:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=976)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=976, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=979, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=978, solved=0))] (%9621:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=976)]) -> (%9622:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=979)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=976, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=981, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=980, solved=0))] (%9621:tensor<[1, 32, 2048], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=976)]) -> (%9623:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=981)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=976, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=983, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=982, solved=0))] (%9621:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=976)]) -> (%9624:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=983)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=979, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=979, solved=0), )] (%9622:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=979)]) -> (%9622:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=979)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=979, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=979, solved=0), )] (%9622:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=979)]) -> (%9625:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=979)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=981, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=981, solved=0), )] (%9623:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=981)]) -> (%9623:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=981)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=981, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=981, solved=0), )] (%9623:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=981)]) -> (%9626:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=981)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=983, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=983, solved=0), )] (%9624:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=983)]) -> (%9624:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=983)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=983, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=983, 
solved=0), )] (%9624:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=983)]) -> (%9627:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=983)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=979, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=985, solved=0))] (%9625:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=979)]) -> (%9628:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=981, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=987, solved=0))] 
(%9626:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=981)]) -> (%9629:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), )] (%9628:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)]) -> (%9628:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), )] (%9628:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)]) -> (%9628:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, 
solved=0), )] (%9628:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)]) -> (%9630:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), )] (%9630:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)], %9628:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)]) -> (%9631:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), )] (%9631:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9632:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), )] (%9628:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9633:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), )] (%9633:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)], %9632:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)]) -> (%9634:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), )] (%9629:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) -> (%9629:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), )] (%9629:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) -> (%9629:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), )] (%9629:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) -> (%9635:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=986, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), )] (%9635:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)], %9629:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) -> (%9636:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), )] (%9636:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9637:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, 
qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), )] (%9629:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9638:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), )] (%9638:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)], %9637:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: 
UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) -> (%9639:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=988, solved=0), )] (%9639:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=986)]) -> (%9640:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=988)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=988, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989, solved=0), )] (%9640:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=988)]) -> (%9641:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989, solved=0), )] (%9641:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, 
scale_type: Float32), uuid=989)]) -> (%9643:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=983, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=990, solved=0), )] (%9627:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=983)]) -> (%9644:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=990)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=990, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=991, solved=0), )] (%9644:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=990)]) -> (%9645:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=991)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29, solved=0), )] (%8260:tensor<[1, 8, 128, 992], 
Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)], %9643:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989)]) -> (%9647:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=991, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57, solved=0), )] (%8261:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57)], %9645:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=991)]) -> (%9648:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29, solved=0), )] (%9647:tensor<[1, 8, 128, 
1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)]) -> (%9649:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57, solved=0), )] (%9648:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57)]) -> (%9650:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=992, solved=0), )] (%9634:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=984)], %9649:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=29)]) -> (%9651:tensor<[1, 
16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=992)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=992, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=993, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=992, solved=0), )] (%9651:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=992)], %9652:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=993), constant:[0.088388346]]) -> (%9653:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=992)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=992, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=994, solved=0), )] (%9653:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=992)]) -> (%9654:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=994)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=994, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=995, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=994, solved=0), )] (%9654:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=994)], %9655:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=995), constant:[-20]]) -> (%9656:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=994)]) + linalg.CPU.EqualOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=996, solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=997, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9657:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=996), constant:[0]]) -> (%9658:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=997)]) 
+ linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=997, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=992, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=994, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=994, solved=0), )] (%9658:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=997)], %9653:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=992)], %9656:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=994)]) -> (%9659:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=994)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=994, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=998, solved=0), )] (%9659:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=994)]) -> (%9660:tensor<[1, 16, 
32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=998)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=998, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=999, solved=0), )] (%9660:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=998)], %9650:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=57)]) -> (%9661:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=999)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=999, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=999, solved=0), )] (%9661:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=999)]) -> (%9662:tensor<[1, 32, 16, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=999)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=999, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=999, solved=0), )] (%9662:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=999)]) -> (%9662:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=999)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=999, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1001, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1000, solved=0))] (%9662:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=999)]) -> (%9663:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1001)]) + 
linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1001, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9620:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9663:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1001)]) -> (%9664:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1002, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1003, solved=0))] (%9664:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9665:tensor<[1, 32, 2048], 
UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1002)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1002, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1005, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1004, solved=0))] (%9665:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1002)]) -> (%9666:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1005)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1002, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1006, solved=0))] (%9665:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1002)]) -> 
(%9667:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1008, solved=0), )] (%9667:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007)]) -> (%9668:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1008)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1008, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007, solved=0), )] (%9667:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007)], %9668:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=1008)]) -> (%9669:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1005, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007, solved=0), )] (%9669:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007)], %9666:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1005)]) -> (%9670:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1010, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), 
uuid=1009, solved=0))] (%9670:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1007)]) -> (%9671:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1010)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1010, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9664:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9671:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1010)]) -> (%9672:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=1011, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1012, solved=0))] (%9672:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9673:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1011)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1011, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1014, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1013, solved=0))] (%9673:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1011)]) -> (%9674:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1014)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1011, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=1016, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1015, solved=0))] (%9673:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1011)]) -> (%9675:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1016)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1011, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1018, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1017, solved=0))] (%9673:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1011)]) -> (%9676:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1018)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1014, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: 
Float32, zero_point_type: Int32), uuid=1014, solved=0), )] (%9674:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1014)]) -> (%9674:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1014)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1014, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1014, solved=0), )] (%9674:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1014)]) -> (%9677:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1014)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1016, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1016, solved=0), )] (%9675:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1016)]) -> (%9675:tensor<[1, 32, 8, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1016)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1016, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1016, solved=0), )] (%9675:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1016)]) -> (%9678:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1016)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1018, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1018, solved=0), )] (%9676:tensor<[1, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1018)]) -> (%9676:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1018)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1018, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1018, solved=0), )] (%9676:tensor<[1, 32, 8, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1018)]) -> (%9679:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1018)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1014, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1020, solved=0))] (%9677:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1014)]) -> (%9680:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1016, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1022, solved=0))] (%9678:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1016)]) -> (%9681:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), )] (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), )] (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%8267:tensor<[1, 1, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), )] (%9680:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)]) -> (%9680:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), )] (%9680:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)]) -> (%9680:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), )] (%9680:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)]) -> (%9682:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), )] (%9682:tensor<[1, 16, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)], %9680:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)]) -> (%9683:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 
65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), )] (%9683:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9684:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), )] (%9680:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9685:tensor<[1, 16, 32, 128], UInt16PerTensor, 
CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), )] (%9685:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)], %9684:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)]) -> (%9686:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), )] (%9681:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) -> (%9681:tensor<[1, 
8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) + linalg.CPU.SliceOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), )] (%9681:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) -> (%9681:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) + linalg.CPU.NegOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), )] (%9681:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) -> (%9687:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, 
quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), )] (%9687:tensor<[1, 8, 32, 64], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)], %9681:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) -> (%9688:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=63, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), )] (%9688:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)], %8267:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, 
quant_to_type: UInt16, scale_type: Float32), uuid=63)]) -> (%9689:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), )] (%9681:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)], %8268:tensor<[1, 1, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32), uuid=65)]) -> (%9690:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), )] (%9690:tensor<[1, 8, 
32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)], %9689:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) -> (%9691:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=1023, solved=0), )] (%9691:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1021)]) -> (%9692:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=1023)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=1023, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024, solved=0), )] (%9692:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=1023)]) -> (%9693:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, 
quant_to_type: UInt8, scale_type: Float32), uuid=1024, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024, solved=0), )] (%9693:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024)]) -> (%9695:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1018, solved=0), outputs_0:QuantSpec(Raw(type: Float16), uuid=1025, solved=0), )] (%9679:tensor<[1, 8, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1018)]) -> (%9696:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=1025)]) + linalg.CPU.CastTypeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: Float16), uuid=1025, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1026, solved=0), )] (%9696:tensor<[1, 8, 32, 128], Float16, CPU>[quant_recipe:QuantSpec(Raw(type: Float16), uuid=1025)]) -> (%9697:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1026)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, 
quant_to_type: Int8, scale_type: Float32), uuid=30, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30, solved=0), )] (%8262:tensor<[1, 8, 128, 992], Int8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)], %9695:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024)]) -> (%9699:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)]) + linalg.CPU.ConcatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1026, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58, solved=0), )] (%8263:tensor<[1, 8, 992, 128], UInt8PerTensor, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58)], %9697:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1026)]) -> (%9700:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58)]) + linalg.CPU.RepeatOp 
[qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30, solved=0), )] (%9699:tensor<[1, 8, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)]) -> (%9701:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)]) + linalg.CPU.RepeatOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58, solved=0), outputs_0:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58, solved=0), )] (%9700:tensor<[1, 8, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58)]) -> (%9702:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1027, solved=0), )] (%9686:tensor<[1, 16, 32, 
128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1019)], %9701:tensor<[1, 16, 128, 1024], Int8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: -128, quant_max: 127, quant_to_type: Int8, scale_type: Float32), uuid=30)]) -> (%9703:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1027)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1027, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1028, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1027, solved=0), )] (%9703:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1027)], %9704:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1028), constant:[0.088388346]]) -> (%9705:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1027)]) + linalg.CPU.ReduceMinOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=1027, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1029, solved=0), )] (%9705:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1027)]) -> (%9706:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1029)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1029, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1030, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1029, solved=0), )] (%9706:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1029)], %9707:tensor<[1], Float32, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1030), constant:[-20]]) -> (%9708:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1029)]) + linalg.CPU.EqualOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt16), uuid=2, solved=0), inputs_1:QuantSpec(Raw(type: UInt16), uuid=1031, 
solved=0), outputs_0:QuantSpec(Raw(type: UInt8), uuid=1032, solved=0), )] (%8207:tensor<[1, 1, 32, 1024], UInt16, CPU>[qnn_graph_inputs:true, quant_recipe:QuantSpec(Raw(type: UInt16), uuid=2)], %9709:tensor<[1], UInt16, CPU>[quant_recipe:QuantSpec(Raw(type: UInt16), uuid=1031), constant:[0]]) -> (%9710:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=1032)]) + linalg.CPU.WhereOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(Raw(type: UInt8), uuid=1032, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1027, solved=0), inputs_2:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1029, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1029, solved=0), )] (%9710:tensor<[1, 1, 32, 1024], UInt8, CPU>[quant_recipe:QuantSpec(Raw(type: UInt8), uuid=1032)], %9705:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1027)], %9708:tensor<[1, 16, 32, 1], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1029)]) -> (%9711:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1029)]) + linalg.CPU.SoftmaxOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), 
uuid=1029, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1033, solved=0), )] (%9711:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1029)]) -> (%9712:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1033)]) + linalg.CPU.MatMulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1033, solved=0), inputs_1:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1034, solved=0), )] (%9712:tensor<[1, 16, 32, 1024], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1033)], %9702:tensor<[1, 16, 1024, 128], UInt8PerTensor, CPU>[quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=58)]) -> (%9713:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1034)]) + linalg.CPU.TransposeOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1034, 
solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1034, solved=0), )] (%9713:tensor<[1, 16, 32, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1034)]) -> (%9714:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1034)]) + linalg.CPU.ViewOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1034, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1034, solved=0), )] (%9714:tensor<[1, 32, 16, 128], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1034)]) -> (%9714:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1034)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1034, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1036, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), 
uuid=1035, solved=0))] (%9714:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1034)]) -> (%9715:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1036)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1036, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9672:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9715:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1036)]) -> (%9716:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=1037, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1038, solved=0))] (%9716:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9717:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1037)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1037, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1040, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1039, solved=0))] (%9717:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1037)]) -> (%9718:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1040)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1037, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=1042, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1041, solved=0))] (%9717:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1037)]) -> (%9719:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1042)]) + linalg.CPU.SigmoidOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1042, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1043, solved=0), )] (%9719:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1042)]) -> (%9720:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1043)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1042, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1043, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=1042, solved=0), )] (%9719:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1042)], %9720:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1043)]) -> (%9721:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1042)]) + linalg.CPU.MulOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1042, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1040, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1042, solved=0), )] (%9721:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1042)], %9718:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1040)]) -> (%9722:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1042)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, 
quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1042, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1045, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1044, solved=0))] (%9722:tensor<[1, 32, 6144], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1042)]) -> (%9723:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1045)]) + linalg.CPU.AddOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), inputs_1:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1045, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), )] (%9716:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)], %9723:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1045)]) -> (%9724:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: 
Int32), uuid=60)]) + linalg.CPU.RMSNormOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1046, solved=0), weight_weight:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1047, solved=0))] (%9724:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=60)]) -> (%9725:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1046)]) + linalg.CPU.LinearOp [qnn_graph_name:model.0.s32, qnn_context_name:context.0, quant_recipe:QuantAnnotation(inputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1046, solved=0), outputs_0:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1049, solved=0), weight_weight:QuantSpec(LPBQ(quant_min: -8, quant_max: 7, block_size: 32, ch_axis: -1, scale_level_0_bitwidth: 4, quant_to_type: UInt4, scale_1_type: Float32), uuid=1048, solved=0)), using_qnn:true] (%9725:tensor<[1, 32, 2048], UInt16PerTensor, CPU>[quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65535, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1046)]) -> (%9726:tensor<[1, 32, 151936], UInt16PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, 
scale_type: Float32, zero_point_type: Int32), uuid=1049)]) + cf.ReturnOp (%9726:tensor<[1, 32, 151936], UInt16PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(AsymPerTensor(quant_min: 0, quant_max: 65536, quant_to_type: UInt16, scale_type: Float32, zero_point_type: Int32), uuid=1049)], %8291:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=79)], %8343:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=114)], %8395:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=149)], %8447:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=184)], %8499:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=219)], %8551:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=254)], %8603:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=289)], %8655:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=324)], %8707:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 
0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=359)], %8759:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=394)], %8811:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=429)], %8863:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=464)], %8915:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=499)], %8967:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=534)], %9019:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=569)], %9071:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=604)], %9123:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=639)], %9175:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=674)], %9227:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, 
quant_to_type: UInt8, scale_type: Float32), uuid=709)], %9279:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=744)], %9331:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=779)], %9383:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=814)], %9435:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=849)], %9487:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=884)], %9539:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=919)], %9591:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=954)], %9643:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=989)], %9695:tensor<[1, 8, 128, 32], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1024)], %8293:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, 
scale_type: Float32), uuid=81)], %8345:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=116)], %8397:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=151)], %8449:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=186)], %8501:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=221)], %8553:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=256)], %8605:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=291)], %8657:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=326)], %8709:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=361)], %8761:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=396)], %8813:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), 
uuid=431)], %8865:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=466)], %8917:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=501)], %8969:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=536)], %9021:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=571)], %9073:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=606)], %9125:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=641)], %9177:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=676)], %9229:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=711)], %9281:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=746)], %9333:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=781)], %9385:tensor<[1, 
8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=816)], %9437:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=851)], %9489:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=886)], %9541:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=921)], %9593:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=956)], %9645:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=991)], %9697:tensor<[1, 8, 32, 128], UInt8PerTensor, CPU>[qnn_graph_outputs:true, quant_recipe:QuantSpec(SymPerTensor(quant_min: 0, quant_max: 255, quant_to_type: UInt8, scale_type: Float32), uuid=1026)]) -> () + } + } } diff --git a/mllm/backends/qnn/QNNUtils.hpp b/mllm/backends/qnn/QNNUtils.hpp index 5c0483dfb..99695e784 100644 --- a/mllm/backends/qnn/QNNUtils.hpp +++ b/mllm/backends/qnn/QNNUtils.hpp @@ -140,7 +140,7 @@ inline void __mllmQnnLoggerCallback(const char* fmt, QnnLog_Level_t level, uint6 inline const std::string QNN_QUANT_SCALE_NAME = "qnn_quant_scale"; inline float getQuantScale(Tensor& tensor) { if (!tensor.attachedViews().contains(QNN_QUANT_SCALE_NAME)) { return 0.0f; } - return tensor.attachedViews()[QNN_QUANT_SCALE_NAME]->ptr()[0]; + return tensor.attachedViews()[QNN_QUANT_SCALE_NAME].second->ptr()[0]; } inline 
void setQuantScale(Tensor& tensor, float scale) { @@ -149,7 +149,7 @@ inline void setQuantScale(Tensor& tensor, float scale) { t.at({0}) = scale; tensor.attach(QNN_QUANT_SCALE_NAME, t.impl()); } else { - tensor.attachedViews()[QNN_QUANT_SCALE_NAME]->ptr()[0] = scale; + tensor.attachedViews()[QNN_QUANT_SCALE_NAME].second->ptr()[0] = scale; } } diff --git a/mllm/backends/qnn/aot/passes/AOTPipeline.cpp b/mllm/backends/qnn/aot/passes/AOTPipeline.cpp index c60c6aa78..af534a4e9 100644 --- a/mllm/backends/qnn/aot/passes/AOTPipeline.cpp +++ b/mllm/backends/qnn/aot/passes/AOTPipeline.cpp @@ -24,8 +24,8 @@ std::vector> createQnnAOTLoweringPipeline(QnnAOTEnv* e ret.emplace_back(createMergeLLMHeadIntoMainGraphPass()); ret.emplace_back(createLLMQuantRecipePass()); ret.emplace_back(createPTQPass()); - // ret.emplace_back(createSplitLLMGraphPass()); - // ret.emplace_back(createMarkTensorIOPass()); + ret.emplace_back(createSplitLLMGraphPass()); + ret.emplace_back(createMarkTensorIOPass()); // ret.emplace_back(createLLM2QnnLoweringPass()); } else { MLLM_WARN("This pass currently only supports LLM applications. 
Please ensure your config contains 'quant_recipe.llm_recipe " diff --git a/mllm/compile/ir/linalg/Attribute.cpp b/mllm/compile/ir/linalg/Attribute.cpp index 91634afcd..03d3e4c32 100644 --- a/mllm/compile/ir/linalg/Attribute.cpp +++ b/mllm/compile/ir/linalg/Attribute.cpp @@ -121,6 +121,8 @@ void LinalgIRQuantizatonAnnotationAttr::dump(IRPrinter& p) { } ss << ", "; ss << "uuid=" << q->uuid; + ss << ", "; + ss << "solved=" << q->solved; ss << ")"; return ss.str(); }; diff --git a/mllm/core/Tensor.cpp b/mllm/core/Tensor.cpp index ee0d69752..61b5ed65f 100644 --- a/mllm/core/Tensor.cpp +++ b/mllm/core/Tensor.cpp @@ -32,12 +32,12 @@ namespace mllm { void Tensor::operator delete(void* ptr) noexcept { ((Tensor*)ptr)->impl_.reset(); - for (auto& [a, _] : ((Tensor*)ptr)->impl_->attachedViews()) { ((Tensor*)ptr)->impl_->attachedViews()[a].reset(); } + for (auto& [a, _] : ((Tensor*)ptr)->impl_->attachedViews()) { ((Tensor*)ptr)->impl_->attachedViews()[a].second.reset(); } } void Tensor::delete_() noexcept { this->impl_.reset(); - for (auto& [a, _] : this->impl_->attachedViews()) { this->impl_->attachedViews()[a].reset(); } + for (auto& [a, _] : this->impl_->attachedViews()) { this->impl_->attachedViews()[a].second.reset(); } } /** @@ -100,13 +100,13 @@ Tensor& Tensor::allocExtraTensorView(const std::string& extra_tensor_name, const MLLM_RT_ASSERT_EQ(impl_->attachedViews().count(extra_tensor_name), 0); auto storage = TensorStorage::create(shape, dtype, device); auto impl = TensorViewImpl::create(shape, storage); - impl_->attachedViews().insert({extra_tensor_name, impl}); + impl_->attachedViews().insert({extra_tensor_name, {true, impl}}); return *this; } Tensor Tensor::getExtraTensorViewInTensor(const std::string& extra_tensor_name) { MLLM_RT_ASSERT_EQ(impl_->attachedViews().count(extra_tensor_name), 1); - return Tensor(impl_->attachedViews().at(extra_tensor_name)); + return Tensor(impl_->attachedViews().at(extra_tensor_name).second); } Tensor Tensor::zeros(const std::vector& 
shape, DataTypes dtype, DeviceTypes device) { @@ -521,14 +521,19 @@ size_t Tensor::hash() const { std::vector heap_buf; auto* buf = stack_buf; - size_t count = 1 + impl_->attachedViews().size(); + size_t count = 1; + for (const auto& [_, view] : impl_->attachedViews()) { + if (!view.first) { count++; } + } if (count > kStackCap) { heap_buf.resize(count); buf = heap_buf.data(); } buf[0] = uuid(); size_t idx = 1; - for (const auto& [_, view] : impl_->attachedViews()) { buf[idx++] = view ? view->uuid() : 0u; } + for (const auto& [_, view] : impl_->attachedViews()) { + if (!view.first) { buf[idx++] = view.second ? view.second->uuid() : 0u; } + } return XXH64(buf, count * sizeof(uint32_t), 0); } diff --git a/mllm/core/Tensor.hpp b/mllm/core/Tensor.hpp index 96a375622..90457721f 100644 --- a/mllm/core/Tensor.hpp +++ b/mllm/core/Tensor.hpp @@ -698,9 +698,13 @@ class Tensor { return *(const_cast(this)->offsettedPtr(offsets)); } - [[nodiscard]] std::unordered_map& attachedViews() { return impl_->attachedViews(); } + [[nodiscard]] std::unordered_map>& attachedViews() { + return impl_->attachedViews(); + } - void attach(const std::string& name, const TensorViewImpl::ptr_t& view) { impl_->attachedViews()[name] = view; } + void attach(const std::string& name, const TensorViewImpl::ptr_t& view, bool exclude_from_hash = false) { + impl_->attachedViews()[name] = {exclude_from_hash, view}; + } private: template diff --git a/mllm/core/TensorViewImpl.hpp b/mllm/core/TensorViewImpl.hpp index 4b7b146b7..61a9fc285 100644 --- a/mllm/core/TensorViewImpl.hpp +++ b/mllm/core/TensorViewImpl.hpp @@ -89,7 +89,7 @@ class TensorViewImpl : public std::enable_shared_from_this { inline void dropStorage() { storage_ = nullptr; } - inline std::unordered_map& attachedViews() { return attached_views_; } + inline std::unordered_map>& attachedViews() { return attached_views_; } private: int32_t shape_len_ = 0; @@ -97,7 +97,9 @@ class TensorViewImpl : public std::enable_shared_from_this { int32_t 
shape_[MLLM_TENSOR_SHAPE_MAX_LEN]; int32_t stride_[MLLM_TENSOR_SHAPE_MAX_LEN]; std::shared_ptr storage_ = nullptr; - std::unordered_map attached_views_; + + // std::pair's bool for judge if this tensor should be considered in hashing + std::unordered_map> attached_views_; }; } // namespace mllm From ecbef680c0266c3d2825346e126fd5e7800f876c Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Mon, 5 Jan 2026 08:22:47 +0000 Subject: [PATCH 10/13] fix: AOT Pipeline pass --- examples/qwen3_qnn_aot/compile.cpp | 21 +- .../qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp | 76 ++----- mllm/backends/qnn/aot/passes/AOTPipeline.cpp | 5 +- mllm/backends/qnn/aot/passes/AOTPipeline.hpp | 4 +- .../qnn/aot/passes/LLMQuantRecipePass.cpp | 113 +++------- mllm/backends/qnn/aot/passes/PTQPass.cpp | 207 +++++++++++++++++- mllm/compile/ir/linalg/Attribute.cpp | 2 + mllm/core/Tensor.cpp | 9 +- mllm/core/Tensor.hpp | 10 + .../qualcomm/transformers/core/qlinear.py | 55 +++++ .../qualcomm/transformers/core/rms_norm.py | 64 +++++- .../transformers/qwen3/modeling_qwen3.py | 22 ++ .../qualcomm/transformers/qwen3/runner.py | 13 ++ .../qualcomm/transformers/qwen3/train.py | 5 + pymllm/convertor/mllm_type_mapping.py | 1 + 15 files changed, 452 insertions(+), 155 deletions(-) diff --git a/examples/qwen3_qnn_aot/compile.cpp b/examples/qwen3_qnn_aot/compile.cpp index 26f10be05..427d6b0e7 100644 --- a/examples/qwen3_qnn_aot/compile.cpp +++ b/examples/qwen3_qnn_aot/compile.cpp @@ -37,17 +37,6 @@ MLLM_MAIN({ auto model_cfg = mllm::models::qwen3::Qwen3Config(model_cfg_path.get()); auto model = mllm::models::qwen3::Qwen3ForCausalLM(model_cfg); auto params = mllm::load(model_path.get(), mllm::ModelFileVersion::kV2); - - // Gen sin and cos - { - auto inv = mllm::models::qwen3::makeRoPEInvFreq(model_cfg.head_dim, model_cfg.rope_theta); - auto position_ids = mllm::Tensor::empty({CL}, mllm::kInt32, mllm::kCPU).alloc(); - auto position_ids_ptr = position_ids.ptr(); - for (int s = 0; s < CL; ++s) { 
position_ids_ptr[s] = s; } - auto [rope_sin, rope_cos] = mllm::models::qwen3::makeRotaryPosEmbedding(position_ids, inv, 1.f); - params->push("rope_sin", rope_sin.to(mllm::kUInt16PerTensorSym).setMemType(mllm::kParamsNormal).setName("rope_sin")); - params->push("rope_cos", rope_cos.to(mllm::kUInt16PerTensorSym).setMemType(mllm::kParamsNormal).setName("rope_cos")); - } model.load(params); // Sequence: [B, N] @@ -72,8 +61,14 @@ MLLM_MAIN({ model_cfg.num_key_value_heads, model_cfg.head_dim, CL - N, - }, mllm::kInt8PerTensorSym); + }, mllm::kUInt8PerTensorSym); trace_inputs[past_value_name] = mllm::Tensor::empty({1, model_cfg.num_key_value_heads, CL - N, model_cfg.head_dim}, mllm::kUInt8PerTensorSym); + + trace_inputs[past_key_name].attach("scale", params->pull("model.layers." + std::to_string(i) + ".self_attn.k_cast_to_int8_qdq.fake_quant.scale").impl(), true); + trace_inputs[past_key_name].attach("zero_point", params->pull("model.layers." + std::to_string(i) + ".self_attn.k_cast_to_int8_qdq.fake_quant.zero_point").impl(), true); + + trace_inputs[past_value_name].attach("scale", params->pull("model.layers." + std::to_string(i) + ".self_attn.v_cast_to_int8_qdq.fake_quant.scale").impl(), true); + trace_inputs[past_value_name].attach("zero_point", params->pull("model.layers." 
+ std::to_string(i) + ".self_attn.v_cast_to_int8_qdq.fake_quant.zero_point").impl(), true); // clang-format on } @@ -84,7 +79,7 @@ MLLM_MAIN({ mllm::qnn::aot::parseQcomTargetMachineFromJSONFile(qnn_aot_cfg_files.get())); mllm::ir::PassManager pm(ir["model"]); - pm.reg(mllm::qnn::aot::createQnnAOTLoweringPipeline(&qnn_aot_env, qnn_aot_cfg_files.get())); + pm.reg(mllm::qnn::aot::createQnnAOTLoweringPipeline(&qnn_aot_env, qnn_aot_cfg_files.get(), params)); pm.run(); mllm::redirect("qwen3_qnn_aot.mir", [&]() { mllm::print(ir["model"]); }); diff --git a/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp b/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp index 5677d27f2..14241684a 100644 --- a/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp +++ b/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp @@ -88,64 +88,30 @@ Tensor QDQ_KV(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch) return in; } -} // namespace ptq +Tensor QDQ_ROPE(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch) { + auto scale_name = m->getModuleName() + "." + qdq_name_in_pytorch + ".fake_quant.scale"; + auto zp_name = m->getModuleName() + "." 
+ qdq_name_in_pytorch + ".fake_quant.zero_point"; -inline auto makeRoPEInvFreq(int output_dim, float rope_theta) -> Tensor { - auto inv_freq = Tensor::empty({output_dim / 2}, kFloat32, kCPU).alloc(); - auto inv_freq_ptr = inv_freq.ptr(); - for (int i = 0; i < output_dim / 2; i++) { inv_freq_ptr[i] = 1.0 / std::pow(rope_theta, 2.0 * i / output_dim); } - return inv_freq; -} + (void)in.__unsafeSetDType(kUInt16PerTensorAsy); -inline auto makeRotaryPosEmbedding(Tensor& position_ids, const Tensor& inv_freq, - float attention_scaling = 1.0f) -> std::pair { - auto batch_size = position_ids.shape()[0]; - auto seq_len = position_ids.shape()[1]; - auto inv_freq_len = inv_freq.shape()[0]; - auto dim = inv_freq_len * 2; - - // Create freqs tensor: position_ids @ inv_freq - auto freqs = Tensor::empty({batch_size, seq_len, inv_freq_len}, kFloat32, kCPU).alloc(); - auto freqs_ptr = freqs.ptr(); - auto position_ids_ptr = position_ids.ptr(); - auto inv_freq_ptr = inv_freq.ptr(); - - // Compute freqs = position_ids[:, :, None] @ inv_freq[None, :] - for (int b = 0; b < batch_size; ++b) { - for (int s = 0; s < seq_len; ++s) { - auto pos = position_ids_ptr[b * seq_len + s]; - for (int d = 0; d < inv_freq_len; ++d) { - freqs_ptr[b * seq_len * inv_freq_len + s * inv_freq_len + d] = static_cast(pos) * inv_freq_ptr[d]; - } + switch (in.dtype()) { + case kUInt16PerTensorAsy: { + auto scale = m->getTopParameterFile()->pull(scale_name); + auto zp = m->getTopParameterFile()->pull(zp_name); + in.attach("scale", scale.impl(), true); + in.attach("zero_point", zp.impl(), true); + break; } - } - - // Create sin and cos tensors with shape [batch_size, seq_len, dim] - auto sin_emb = Tensor::empty({batch_size, seq_len, dim}, kFloat32, kCPU).alloc(); - auto cos_emb = Tensor::empty({batch_size, seq_len, dim}, kFloat32, kCPU).alloc(); - auto sin_ptr = sin_emb.ptr(); - auto cos_ptr = cos_emb.ptr(); - - // Compute sin and cos embeddings: emb = [freqs, freqs] - for (int b = 0; b < batch_size; ++b) { - for 
(int s = 0; s < seq_len; ++s) { - for (int d = 0; d < inv_freq_len; ++d) { - auto freq = freqs_ptr[b * seq_len * inv_freq_len + s * inv_freq_len + d]; - auto sin_val = std::sin(freq) * attention_scaling; - auto cos_val = std::cos(freq) * attention_scaling; - - // Store the same values in both halves: [freqs, freqs] - sin_ptr[b * seq_len * dim + s * dim + d] = sin_val; - sin_ptr[b * seq_len * dim + s * dim + d + inv_freq_len] = sin_val; - cos_ptr[b * seq_len * dim + s * dim + d] = cos_val; - cos_ptr[b * seq_len * dim + s * dim + d + inv_freq_len] = cos_val; - } + default: { + MLLM_ERROR_EXIT(ExitCode::kCoreError, "Can't Process dtype={}", nameOfType(in.dtype())); } } - return {sin_emb, cos_emb}; + return in; } +} // namespace ptq + class Qwen3MLP final : public nn::Module { nn::Linear gate_proj_; nn::Linear up_proj_; @@ -357,8 +323,8 @@ class Qwen3Text final : public nn::Module { for (auto [idx, b] : enumerate(decode_blocks_.list())) { b.self_attn_.layer_idx_ = idx; } norm_ = reg("norm", cfg.rms_norm_eps); embedding_ = reg("embed_tokens", cfg.vocab_size, cfg.hidden_size); - rope_sin_ = reg("rope_sin", "rope_sin"); - rope_cos_ = reg("rope_cos", "rope_cos"); + rope_sin_ = reg("mllm_max_sin_embedding", "model.mllm_max_sin_embedding"); + rope_cos_ = reg("mllm_max_cos_embedding", "model.mllm_max_cos_embedding"); } std::vector forward(const std::vector& inputs, const std::vector& args) override { @@ -372,8 +338,8 @@ class Qwen3Text final : public nn::Module { auto position_ids = inputs[1]; auto causal_mask = inputs[2]; - auto llm_embedding_sin = rope_sin_()[{{0}, position_ids, {kAll}}]; - auto llm_embedding_cos = rope_cos_()[{{0}, position_ids, {kAll}}]; + auto llm_embedding_sin = ptq::QDQ_ROPE(this, rope_sin_(), "sin_embedding_input_qdq")[{{0}, position_ids, {kAll}}]; + auto llm_embedding_cos = ptq::QDQ_ROPE(this, rope_cos_(), "cos_embedding_input_qdq")[{{0}, position_ids, {kAll}}]; std::vector keys; std::vector values; @@ -477,7 +443,7 @@ class Qwen3ForCausalLM : public 
ARGeneration, public nn::Module { sequence = llm(llm_inputs)[0]; sequence = lm_head_(ptq::QDQ(this, sequence, "lm_head_input_qdq")); - ptq::QDQ(this, sequence, "lm_head_output_qdq"); + sequence = ptq::QDQ(this, sequence, "lm_head_output_qdq"); ir::lowlevel::traceComment(" ╔═════╗ "); ir::lowlevel::traceComment(" ║ o o ║ "); ir::lowlevel::traceComment(" ║ ▽ ║ "); diff --git a/mllm/backends/qnn/aot/passes/AOTPipeline.cpp b/mllm/backends/qnn/aot/passes/AOTPipeline.cpp index af534a4e9..80fb94ba9 100644 --- a/mllm/backends/qnn/aot/passes/AOTPipeline.cpp +++ b/mllm/backends/qnn/aot/passes/AOTPipeline.cpp @@ -8,13 +8,16 @@ #include "mllm/backends/qnn/aot/passes/OpNamingPass.hpp" #include "mllm/backends/qnn/aot/passes/PTQPass.hpp" #include "mllm/backends/qnn/aot/passes/SplitLLMGraphPass.hpp" +#include "mllm/core/ParameterFile.hpp" namespace mllm::qnn::aot { -std::vector> createQnnAOTLoweringPipeline(QnnAOTEnv* env, const std::string& config_path) { +std::vector> createQnnAOTLoweringPipeline(QnnAOTEnv* env, const std::string& config_path, + const ParameterFile::ptr_t& pf) { std::vector ret; AOTCompileContext::getInstance().setEnv(env); AOTCompileContext::getInstance().setConfig(config_path); + AOTCompileContext::getInstance().setParamFile(pf); auto config = AOTCompileContext::getInstance().getConfig(); if (config.contains("quant_recipe") && config["quant_recipe"].contains("llm_recipe") diff --git a/mllm/backends/qnn/aot/passes/AOTPipeline.hpp b/mllm/backends/qnn/aot/passes/AOTPipeline.hpp index 0b14f0c11..d854de974 100644 --- a/mllm/backends/qnn/aot/passes/AOTPipeline.hpp +++ b/mllm/backends/qnn/aot/passes/AOTPipeline.hpp @@ -7,9 +7,11 @@ #include "mllm/backends/qnn/aot/QnnWrappersAPI.hpp" #include "mllm/compile/passes/Pass.hpp" +#include "mllm/core/ParameterFile.hpp" namespace mllm::qnn::aot { -std::vector> createQnnAOTLoweringPipeline(QnnAOTEnv* env, const std::string& config_path); +std::vector> createQnnAOTLoweringPipeline(QnnAOTEnv* env, const std::string& config_path, 
+ const ParameterFile::ptr_t& pf); } // namespace mllm::qnn::aot diff --git a/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp b/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp index adada76ed..f60ecc14d 100644 --- a/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp +++ b/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp @@ -387,8 +387,9 @@ bool LLMQuantRecipeRMSNormPattern::rewrite(ir::IRWriter& writer, const ir::op_pt MLLM_RETURN_FALSE_IF_NOT(weight_reg_tensor_ir->outputs().front()->isa_()); auto t = weight_reg_tensor_ir->outputs().front()->cast_(); - auto weight_spec_attr = cloneQuantizationSpecType( - writer.getContext(), node->inputs().front()->getAttr("quant_recipe")->cast_()); + // FIXME: This dtype is hardcoded. We should make it right. + auto weight_spec_attr = writer.create( + ir::linalg::QuantizationSpecAsymPerTensor::create(0, 65536, kUInt16, kFloat32, kInt32, Tensor::nil(), Tensor::nil())); weight_reg_tensor_ir->outputs().front()->setAttr("quant_recipe", weight_spec_attr); // Get self anno @@ -421,26 +422,15 @@ bool LLMQuantRecipeIndexPattern::isMatch(const mllm::ir::op_ptr_t& op) { bool LLMQuantRecipeIndexPattern::rewrite(ir::IRWriter& writer, const ir::op_ptr_t& node) { auto index_ir = node->cast_(); - auto i_0 = *(node->inputs().begin()); // Index what - auto o_0 = *(node->outputs().begin()); // Output + auto i_0 = *(node->inputs().begin()); // Index what if (!i_0->getAttr("quant_recipe")) { auto i_0_spec = genSimpleQuantizationSpecAttr(writer.getContext(), i_0->cast_()); i_0->setAttr("quant_recipe", i_0_spec); } - auto o_0_spec = genSimpleQuantizationSpecAttr(writer.getContext(), o_0->cast_()); - o_0->setAttr("quant_recipe", o_0_spec); - - auto annotation_attr = writer.create(); - annotation_attr->annotation_.inputs.emplace_back( - i_0->getAttr("quant_recipe")->cast_()->spec_); - annotation_attr->annotation_.outputs.emplace_back( - o_0->getAttr("quant_recipe")->cast_()->spec_); - - node->setAttr("quant_recipe", annotation_attr); - - 
return true; + return shareQuantSpecSingleInputToSingleOutputAndSetOpQuantAnnoAttr(writer.getContext(), + node->cast_()); } //===----------------------------------------------------------------------===// @@ -848,85 +838,54 @@ bool LLMQuantRecipeViewPattern::rewrite(ir::IRWriter& writer, const ir::op_ptr_t bool LLMQuantRecipeEmbeddingPattern::isMatch(const mllm::ir::op_ptr_t& op) { // Pattern: // - // embedding(op) -> quantize(op) + // embedding(op) MLLM_RETURN_FALSE_IF_NOT(op->isa_()); - MLLM_RETURN_FALSE_IF_NOT(op->nextOp()); - MLLM_RETURN_FALSE_IF_NOT(op->nextOp()->isa_()); // Already marked. MLLM_RETURN_FALSE_IF(op->getAttr("quant_recipe")); - MLLM_RETURN_FALSE_IF(op->nextOp()->getAttr("quant_recipe")); return true; } bool LLMQuantRecipeEmbeddingPattern::rewrite(ir::IRWriter& writer, const ir::op_ptr_t& node) { auto embedding_op = node->cast_(); - auto quantize_op = embedding_op->nextOp()->cast_(); + auto i_0 = *(node->inputs().begin()); + auto o_0 = *(node->outputs().begin()); auto annotation_attr = writer.create(); - // Inputs to this Quantization node must be raw type. 
- { - auto i_type = quantize_op->inputs().front()->cast_()->tensor_.dtype(); - MLLM_RT_ASSERT(i_type == kFloat32 || i_type == kFloat16); - auto i_quant_spec = ir::linalg::QuantizationSpecRaw::create(i_type); - annotation_attr->annotation_.inputs.emplace_back(i_quant_spec); - quantize_op->inputs().front()->setAttr("quant_recipe", - writer.create(i_quant_spec)); + if (!i_0->getAttr("quant_recipe")) { + auto i_0_spec = genSimpleQuantizationSpecAttr(writer.getContext(), i_0->cast_()); + i_0->setAttr("quant_recipe", i_0_spec); + } else { + annotation_attr->annotation_.inputs.emplace_back( + i_0->getAttr("quant_recipe")->cast_()->spec_); } - // Outputs to this Quantization node must be int8 or int16 - { - auto o_type = quantize_op->outputs().front()->cast_()->tensor_.dtype(); - ir::linalg::QuantizationSpec::ptr_t o_quant_spec = nullptr; - switch (o_type) { - case kInt8PerTensorSym: { - o_quant_spec = ir::linalg::QuantizationSpecSymPerTensor::create(-128, 127, kInt8, kFloat32, Tensor::nil()); - break; - } - case kUInt8PerTensorSym: { - o_quant_spec = ir::linalg::QuantizationSpecSymPerTensor::create(0, 255, kUInt8, kFloat32, Tensor::nil()); - break; - } - case kInt16PerTensorSym: { - o_quant_spec = ir::linalg::QuantizationSpecSymPerTensor::create(-32768, 32767, kInt16, kFloat32, Tensor::nil()); - break; - } - case kUInt16PerTensorSym: { - o_quant_spec = ir::linalg::QuantizationSpecSymPerTensor::create(0, 65535, kUInt16, kFloat32, Tensor::nil()); - break; - } - case kUInt16PerTensorAsy: { - o_quant_spec = ir::linalg::QuantizationSpecAsymPerTensor::create(0, 65535, kUInt16, kFloat32, kInt32, Tensor::nil(), - Tensor::nil()); - break; - } - default: { - NYI("Only support [uint16, int16, uint8, int8], [sym] for now."); - } - } - - // Weights - auto weight_name = embedding_op->getAOp()->getName() + ".weight"; - auto weight_reg_tensor_ir = writer.getContext()->lookupSymbolTable(weight_name); - MLLM_RETURN_FALSE_IF_NOT(weight_reg_tensor_ir); - 
MLLM_RETURN_FALSE_IF_NOT(weight_reg_tensor_ir->isa_()); - MLLM_RETURN_FALSE_IF_NOT(weight_reg_tensor_ir->outputs().front()->isa_()); - auto weight_tensor = weight_reg_tensor_ir->outputs().front()->cast_(); - - annotation_attr->annotation_.outputs.emplace_back(o_quant_spec); - quantize_op->outputs().front()->setAttr("quant_recipe", - writer.create(o_quant_spec)); - - // Embedding weight quantization method same as outputs, but not share, just same type - auto weight_spec_attr = genSimpleQuantizationSpecAttr(writer.getContext(), weight_tensor); - weight_reg_tensor_ir->outputs().front()->setAttr("quant_recipe", weight_spec_attr); - annotation_attr->annotation_.weights.insert({"weight", weight_spec_attr->spec_}); + if (!o_0->getAttr("quant_recipe")) { + auto o_0_spec = genSimpleQuantizationSpecAttr(writer.getContext(), o_0->cast_()); + o_0->setAttr("quant_recipe", o_0_spec); + annotation_attr->annotation_.outputs.emplace_back(o_0_spec->spec_); + } else { + annotation_attr->annotation_.inputs.emplace_back( + o_0->getAttr("quant_recipe")->cast_()->spec_); } + // Weights + auto weight_name = embedding_op->getAOp()->getName() + ".weight"; + auto weight_reg_tensor_ir = writer.getContext()->lookupSymbolTable(weight_name); + MLLM_RETURN_FALSE_IF_NOT(weight_reg_tensor_ir); + MLLM_RETURN_FALSE_IF_NOT(weight_reg_tensor_ir->isa_()); + MLLM_RETURN_FALSE_IF_NOT(weight_reg_tensor_ir->outputs().front()->isa_()); + auto weight_tensor = weight_reg_tensor_ir->outputs().front()->cast_(); + + // Embedding weight quantization method same as outputs, but not share, just same type + auto weight_spec_attr = genSimpleQuantizationSpecAttr(writer.getContext(), weight_tensor); + weight_reg_tensor_ir->outputs().front()->setAttr("quant_recipe", weight_spec_attr); + annotation_attr->annotation_.weights.insert({"weight", weight_spec_attr->spec_}); + // Attach to quantize node - node->nextOp()->setAttr("quant_recipe", annotation_attr); + node->setAttr("quant_recipe", annotation_attr); return true; } 
diff --git a/mllm/backends/qnn/aot/passes/PTQPass.cpp b/mllm/backends/qnn/aot/passes/PTQPass.cpp index 9d4cabee3..0539b23a2 100644 --- a/mllm/backends/qnn/aot/passes/PTQPass.cpp +++ b/mllm/backends/qnn/aot/passes/PTQPass.cpp @@ -1,28 +1,229 @@ // Copyright (c) MLLM Team. // Licensed under the MIT License. +#include + #include "mllm/backends/qnn/aot/passes/PTQPass.hpp" #include "mllm/backends/qnn/aot/passes/AOTCompileContext.hpp" #include "mllm/compile/ir/builtin/Op.hpp" #include "mllm/compile/ir/graph/Op.hpp" +#include "mllm/compile/ir/linalg/Attribute.hpp" #include "mllm/compile/ir/linalg/Op.hpp" #include "mllm/compile/ir/tensor/Value.hpp" #include "mllm/compile/ir/cf/Op.hpp" #include "mllm/compile/ir/Node.hpp" #include "mllm/core/OpTypes.hpp" +#include "mllm/core/ParameterFile.hpp" #include "mllm/utils/Common.hpp" namespace mllm::qnn::aot { namespace { -void solveStaticWeights() {} +template +void checkTypeLimits(Tensor in, int quant_min, int quant_max) { // NOLINT + auto numel = in.numel(); + for (int i = 0; i < numel; ++i) { + MLLM_RT_ASSERT(*(in.ptr() + i) >= quant_min); + MLLM_RT_ASSERT(*(in.ptr() + i) <= quant_max); + } +} + +void solveLinearWeight(const ir::IRContext::ptr_t& ctx, const ParameterFile::ptr_t& pf, + const ir::linalg::LinalgIROp::ptr_t& op) { + auto mllm_op = op->getAOp(); + MLLM_INFO("PTQPass working on Op: {}'s weight", mllm_op->getName()); + auto weight_spec = + op->getAttr("quant_recipe")->cast_()->annotation_.weights.at("weight"); + + if (weight_spec->solved) return; + + switch (weight_spec->type) { + case ir::linalg::QuantizationSpecType::kLPBQ: { + auto this_spec = std::static_pointer_cast(weight_spec); + auto scale1 = pf->pull(mllm_op->getName() + ".scale1"); // using uint8 to store uint4 + auto scale2 = pf->pull(mllm_op->getName() + ".scale2"); + auto weight = pf->pull(mllm_op->getName() + ".weight"); + + // FIXME weight maybe error, Check qnn eats int8 or uint8. Here weight using int8 to store int4. 
+ checkTypeLimits(weight, -8, 7); // Int4 + checkTypeLimits(scale1, 0, 16); // UInt4 + + this_spec->scale_level_0_int = scale1; + this_spec->scale_level_1_fp = scale2; + + weight_spec->solved = true; + break; + } + default: { + NYI("quant recipe type not support"); + } + } +} + +void solveRMSNormWeight(const ir::IRContext::ptr_t& ctx, const ParameterFile::ptr_t& pf, + const ir::linalg::LinalgIROp::ptr_t& op) { + auto mllm_op = op->getAOp(); + MLLM_INFO("PTQPass working on Op: {}'s weight", mllm_op->getName()); + auto weight_spec = + op->getAttr("quant_recipe")->cast_()->annotation_.weights.at("weight"); + + if (weight_spec->solved) return; + + switch (weight_spec->type) { + case ir::linalg::QuantizationSpecType::kRaw: { + weight_spec->solved = true; + break; + } + case ir::linalg::QuantizationSpecType::kAsymPerTensor: { + auto this_spec = std::static_pointer_cast(weight_spec); + auto scale = pf->pull(mllm_op->getName() + ".scale"); + auto zero_point = pf->pull(mllm_op->getName() + ".zero_point"); + this_spec->scale = scale; + this_spec->zero_point = zero_point; + checkTypeLimits(pf->pull(mllm_op->getName() + ".weight"), this_spec->quant_min, this_spec->quant_max); + MLLM_RT_ASSERT(scale.dtype() == kFloat32); + MLLM_RT_ASSERT(scale.rank() == 1); + MLLM_RT_ASSERT(scale.item() > 0); + MLLM_RT_ASSERT(zero_point.dtype() == kInt32); + MLLM_RT_ASSERT(zero_point.rank() == 1); + MLLM_RT_ASSERT(zero_point.item() >= 0); + weight_spec->solved = true; + break; + } + default: { + NYI("quant recipe type not support"); + } + } +} + +void solveEmbeddingWeight(const ir::IRContext::ptr_t& ctx, const ParameterFile::ptr_t& pf, + const ir::linalg::LinalgIROp::ptr_t& op) { + auto mllm_op = op->getAOp(); + MLLM_INFO("PTQPass working on Op: {}'s weight", mllm_op->getName()); + auto weight_spec = + op->getAttr("quant_recipe")->cast_()->annotation_.weights.at("weight"); + + if (weight_spec->solved) return; + + switch (weight_spec->type) { + case ir::linalg::QuantizationSpecType::kRaw: { + 
weight_spec->solved = true; + break; + } + default: { + NYI("quant recipe type not support"); + } + } +} + +void recursiveSolveWeights(const std::shared_ptr& ir_ctx, const ir::graph::SubGraphOp::ptr_t& call_op, + const ParameterFile::ptr_t& pf) { + auto wow = ir::IRWriter(ir_ctx, call_op->getTopRegion()); + wow.walk([&](ir::IRWriter& w, const ir::Op::ptr_t& op) -> ir::IRWriter::WalkResult { + if (op->isa_()) { solveLinearWeight(w.getContext(), pf, op->cast_()); } + if (op->isa_()) { solveRMSNormWeight(w.getContext(), pf, op->cast_()); } + if (op->isa_()) { solveEmbeddingWeight(w.getContext(), pf, op->cast_()); } + if (op->isa_()) { + auto ns = op->cast_()->getSymbolAttr()->str(); + recursiveSolveWeights(w.getContext(), w.getContext()->lookupSymbolTable(ns)->cast_(), pf); + } + return ir::IRWriter::WALK_CONTINUE; + }); +} -void solveStaticRoPE() {} +void __recursiveSolveNormalImpl(const ir::Val::ptr_t& v) { + MLLM_RT_ASSERT(v->isa_()); + auto tv = v->cast_(); + MLLM_RT_ASSERT(tv->getAttr("quant_recipe")); + auto f_spec = tv->getAttr("quant_recipe")->cast_(); + + if (f_spec->spec_->solved) { return; } + + switch (f_spec->spec_->type) { + case ir::linalg::QuantizationSpecType::kAsymPerTensor: { + if (!tv->tensor_.hasAttachedView("scale") || !tv->tensor_.hasAttachedView("zero_point")) { return; } + auto scale = tv->tensor_.getExtraTensorViewInTensor("scale"); + auto zero_point = tv->tensor_.getExtraTensorViewInTensor("zero_point"); + auto this_spec = std::static_pointer_cast(f_spec->spec_); + this_spec->scale = scale; + this_spec->zero_point = zero_point; + this_spec->solved = true; + break; + } + case ir::linalg::QuantizationSpecType::kSymPerTensor: { + if (!tv->tensor_.hasAttachedView("scale")) { return; } + auto scale = tv->tensor_.getExtraTensorViewInTensor("scale"); + auto this_spec = std::static_pointer_cast(f_spec->spec_); + this_spec->scale = scale; + this_spec->solved = true; + break; + } + case ir::linalg::QuantizationSpecType::kRaw: { + auto this_spec = 
std::static_pointer_cast(f_spec->spec_); + this_spec->solved = true; + break; + } + default: { + NYI("quant recipe type not support on tensor: {}", v->name()); + } + } +} + +void recursiveSolveNormal(const std::shared_ptr& ir_ctx, const ir::graph::SubGraphOp::ptr_t& call_op, + const ParameterFile::ptr_t& pf) { + auto wow = ir::IRWriter(ir_ctx, call_op->getTopRegion()); + wow.walk([&](ir::IRWriter& w, const ir::Op::ptr_t& op) -> ir::IRWriter::WalkResult { + if (op->isa_()) { + MLLM_INFO("PTQPass relax working on Op: {}'s tensors", op->cast_()->getAOp()->getName()); + + auto inputs = op->inputs(); + auto outputs = op->outputs(); + + for (auto iii : inputs) { __recursiveSolveNormalImpl(iii->cast_()); } + for (auto ooo : inputs) { __recursiveSolveNormalImpl(ooo->cast_()); } + } + + if (op->isa_()) { + auto ns = op->cast_()->getSymbolAttr()->str(); + recursiveSolveNormal(w.getContext(), w.getContext()->lookupSymbolTable(ns)->cast_(), pf); + } + return ir::IRWriter::WALK_CONTINUE; + }); +} } // namespace -uint8_t PTQPass::run(const ir::node_ptr_t& op) { return ir::PASS_RET_SUCCESS; } +uint8_t PTQPass::run(const ir::node_ptr_t& op) { + auto pf = AOTCompileContext::getInstance().getParamFile(); + + // The top op should be ModuleOp + MLLM_RT_ASSERT(op->isa_()); + + auto module_op = op->cast_(); + auto writer = ir::IRWriter(getCtx(), module_op->getTopRegion()); + + ir::graph::CallGraphOp::ptr_t call_main_graph_op = nullptr; + writer.walk( + [&](ir::IRWriter& /*writer*/, const ir::graph::CallGraphOp::ptr_t& call_op) -> ir::IRWriter::WalkResult { + MLLM_RT_ASSERT_EQ(call_main_graph_op, nullptr); + + call_main_graph_op = call_op; + return ir::IRWriter::WalkResult::WALK_CONTINUE; + }); + + // Solve all registered weight + recursiveSolveWeights(writer.getContext(), + getCtx()->lookupSymbolTable(call_main_graph_op->getSymbolAttr()->str())->cast_(), + pf); + + // Solve other normal tensors + recursiveSolveNormal(writer.getContext(), + 
getCtx()->lookupSymbolTable(call_main_graph_op->getSymbolAttr()->str())->cast_(), + pf); + + return ir::PASS_RET_SUCCESS; +} ir::Pass::ptr_t createPTQPass() { return std::make_shared(); } diff --git a/mllm/compile/ir/linalg/Attribute.cpp b/mllm/compile/ir/linalg/Attribute.cpp index 03d3e4c32..09b35fc96 100644 --- a/mllm/compile/ir/linalg/Attribute.cpp +++ b/mllm/compile/ir/linalg/Attribute.cpp @@ -264,6 +264,8 @@ void LinalgIRQuantizatonSpecAttr::dump(IRPrinter& p) { } ss << ", "; ss << "uuid=" << q->uuid; + ss << ", "; + ss << "solved=" << q->solved; ss << ")"; return ss.str(); }; diff --git a/mllm/core/Tensor.cpp b/mllm/core/Tensor.cpp index 61b5ed65f..4c51c1be8 100644 --- a/mllm/core/Tensor.cpp +++ b/mllm/core/Tensor.cpp @@ -105,7 +105,9 @@ Tensor& Tensor::allocExtraTensorView(const std::string& extra_tensor_name, const } Tensor Tensor::getExtraTensorViewInTensor(const std::string& extra_tensor_name) { - MLLM_RT_ASSERT_EQ(impl_->attachedViews().count(extra_tensor_name), 1); + if (impl_->attachedViews().count(extra_tensor_name) != 1) { + MLLM_ERROR_EXIT(ExitCode::kCoreError, "Can't find {}", extra_tensor_name); + } return Tensor(impl_->attachedViews().at(extra_tensor_name).second); } @@ -503,6 +505,11 @@ Tensor& Tensor::setMemType(TensorMemTypes mem_type) { DataTypes Tensor::dtype() const { return impl()->dtype(); } +Tensor Tensor::__unsafeSetDType(DataTypes dt) const { + impl_->storage()->dtype_ = dt; + return *this; +} + DeviceTypes Tensor::device() const { return impl()->device(); } Tensor::shape_t Tensor::shape() const { return impl()->shape(); } diff --git a/mllm/core/Tensor.hpp b/mllm/core/Tensor.hpp index 90457721f..5046a8e91 100644 --- a/mllm/core/Tensor.hpp +++ b/mllm/core/Tensor.hpp @@ -467,6 +467,14 @@ class Tensor { */ [[nodiscard]] DataTypes dtype() const; + /** + * @brief Unsafe set One Datatype + * + * @param dt + * @return Tensor + */ + [[nodiscard]] Tensor __unsafeSetDType(DataTypes dt) const; + /** * @brief Gets device location. 
* @return Current device type. @@ -702,6 +710,8 @@ class Tensor { return impl_->attachedViews(); } + bool hasAttachedView(const std::string& name) { return impl_->attachedViews().count(name) == 1; } + void attach(const std::string& name, const TensorViewImpl::ptr_t& view, bool exclude_from_hash = false) { impl_->attachedViews()[name] = {exclude_from_hash, view}; } diff --git a/pymllm/backends/qualcomm/transformers/core/qlinear.py b/pymllm/backends/qualcomm/transformers/core/qlinear.py index bbfcc60df..54006a197 100644 --- a/pymllm/backends/qualcomm/transformers/core/qlinear.py +++ b/pymllm/backends/qualcomm/transformers/core/qlinear.py @@ -17,7 +17,9 @@ def __init__(self, in_features, out_features, bias=True): self.act_quant = None self.weight_quant = None + self.deploy_mode = False + @torch.no_grad() def freeze_weight(self): """PTQ Core: Observe current weights, calculate and fix Scale/ZP""" if self.weight_quant is not None: @@ -66,12 +68,49 @@ def __init__(self, in_features, out_features, bias=True): ) def forward(self, x): + assert self.deploy_mode is False # Activation quantization logic (add act_quant here if needed) x_q = x # Apply fake quantization: use fixed scale if frozen, otherwise update in real-time w_q = self.weight_quant(self.weight) return F.linear(x_q, w_q, self.bias) + @torch.no_grad() + def convert_to_deploy(self): + if self.deploy_mode: + return + + # 1. Ensure Observer is frozen + if self.weight_quant.scale is None: + self.freeze_weight() + + scale = self.weight_quant.scale + zero_point = self.weight_quant.zero_point + + # 2. Use PyTorch native API for Per-Channel quantization + # This handles per-channel complexity and returns quantized tensor + w_q_obj = torch.quantize_per_channel( + self.weight.float(), scale, zero_point, axis=0, dtype=torch.qint8 + ) + + # 3. Extract pure integer data + w_int = w_q_obj.int_repr() + + # 4. 
Replace Parameter with Buffer + del self.weight + # Register buffer named 'weight' to maintain name consistency + self.register_buffer("weight", w_int) + self.register_buffer("scale", scale) + self.register_buffer("zero_point", zero_point) + + # Remove fake quant module to reduce model size + del self.weight_quant + + self.deploy_mode = True + print( + f"[{self.__class__.__name__}] Converted to deploy. Weight shape: {self.weight.shape}, dtype: {self.weight.dtype}" + ) + # --- 2. LPBQ (Double Quantization) Scheme --- class DoubleQuantizer(nn.Module): @@ -150,3 +189,19 @@ def forward(self, x): # Must use quantized weights w_q for computation w_q = self.weight_quant(self.weight) return F.linear(x, w_q, self.bias) + + @torch.no_grad() + def convert_to_deploy(self): + if self.deploy_mode: + return + + del self.weight + self.register_buffer("weight", self.weight_quant.weight_q) + self.register_buffer("scale1", self.weight_quant.scale_1_uint4) + self.register_buffer("scale2", self.weight_quant.scale_2_fp32) + del self.weight_quant + + self.deploy_mode = True + print( + f"[{self.__class__.__name__}] Converted to deploy. Original float weight removed." 
+ ) diff --git a/pymllm/backends/qualcomm/transformers/core/rms_norm.py b/pymllm/backends/qualcomm/transformers/core/rms_norm.py index 5606dafaa..ec6345d64 100644 --- a/pymllm/backends/qualcomm/transformers/core/rms_norm.py +++ b/pymllm/backends/qualcomm/transformers/core/rms_norm.py @@ -12,6 +12,7 @@ def __init__( ): super().__init__() self.eps = eps + self.quant_bits = quant_bits if isinstance(normalized_shape, int): normalized_shape = (normalized_shape,) @@ -20,12 +21,12 @@ def __init__( # Quantization configuration for Weight self.weight_fake_quant = FakeQuantize( observer=MinMaxObserver.with_args( - qscheme=torch.per_tensor_symmetric, dtype=torch.qint32 + qscheme=torch.per_tensor_affine, dtype=torch.qint32 ), - quant_min=-(2 ** (quant_bits - 1)), - quant_max=2 ** (quant_bits - 1) - 1, + quant_min=0, + quant_max=2 ** (quant_bits) - 1, dtype=torch.qint32, - qscheme=torch.per_tensor_symmetric, + qscheme=torch.per_tensor_affine, ) def forward(self, x): @@ -42,6 +43,61 @@ def forward(self, x): return (x_normed * w_q).to(input_dtype) + @torch.no_grad() + def convert_to_deploy(self): + """ + In-place replacement of self.weight: + Float Parameter -> Int Buffer + """ + # 1. Ensure quantization parameters are ready + if self.weight_fake_quant.scale is None: + self.freeze_weight() + + scale = self.weight_fake_quant.scale + zero_point = self.weight_fake_quant.zero_point + quant_min = self.weight_fake_quant.quant_min + quant_max = self.weight_fake_quant.quant_max + + # 2. Calculate integer values + # w_int = round(w / s + zp) + w_int = torch.round(self.weight / scale + zero_point).clamp( + quant_min, quant_max + ) + + # 3. Set target integer type + if self.quant_bits <= 8: + target_dtype = torch.int8 + elif self.quant_bits <= 16: + target_dtype = torch.int16 + else: + target_dtype = torch.int32 + + w_int = w_int.to(target_dtype) + + # === Key steps: Replacement operations === + + # A. 
Delete original Parameter 'weight' + # Must delete first, otherwise cannot register buffer with same name + del self.weight + + # B. Register Buffer with same name 'weight' + # This makes state_dict['weight'] become Int Tensor + self.register_buffer("weight", w_int) + + # C. Register Scale (usually needed by engine) + self.register_buffer("scale", scale) + self.register_buffer("zero_point", zero_point) + + # D. Clean up unnecessary modules + if hasattr(self, "weight_fake_quant"): + del self.weight_fake_quant + + class_name = self.__class__.__name__ + instance_class_name = type(self).__name__ + print( + f"Class: {class_name}, Instance: {instance_class_name}, Deploy Mode Activated. 'weight' is now {self.weight.dtype} buffer. zp is {zero_point}" + ) + @torch.no_grad() def freeze_weight(self): """ diff --git a/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py b/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py index 5148684af..9c0696328 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py +++ b/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py @@ -473,6 +473,28 @@ def __init__(self, config: Qwen3Config): # Initialize weights and apply final processing self.post_init() + @torch.no_grad() + def convert_rope_for_deploy(self): + sin_scale = self.sin_embedding_input_qdq.fake_quant.scale + sin_zero_point = self.sin_embedding_input_qdq.fake_quant.zero_point + sin_quant_min = self.sin_embedding_input_qdq.fake_quant.quant_min + sin_quant_max = self.sin_embedding_input_qdq.fake_quant.quant_max + + cos_scale = self.cos_embedding_input_qdq.fake_quant.scale + cos_zero_point = self.cos_embedding_input_qdq.fake_quant.zero_point + cos_quant_min = self.cos_embedding_input_qdq.fake_quant.quant_min + cos_quant_max = self.cos_embedding_input_qdq.fake_quant.quant_max + + sin_int = torch.round( + self.mllm_max_sin_embedding / sin_scale + sin_zero_point + ).clamp(sin_quant_min, sin_quant_max) + self.mllm_max_sin_embedding = 
sin_int.to(torch.uint16) + + cos_int = torch.round( + self.mllm_max_cos_embedding / cos_scale + cos_zero_point + ).clamp(cos_quant_min, cos_quant_max) + self.mllm_max_cos_embedding = cos_int.to(torch.uint16) + @check_model_inputs() @auto_docstring def forward( diff --git a/pymllm/backends/qualcomm/transformers/qwen3/runner.py b/pymllm/backends/qualcomm/transformers/qwen3/runner.py index 37f8bae16..7c36940ab 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/runner.py +++ b/pymllm/backends/qualcomm/transformers/qwen3/runner.py @@ -31,6 +31,15 @@ def enable_qdq_observer(m): m.enable_observer() +def convert_weight(m): + if ( + isinstance(m, QLinearLPBQ) + or isinstance(m, QLinearW8A16_PerChannelSym) + or isinstance(m, QRMSNorm) + ): + m.convert_to_deploy() + + class Qwen3Quantizer: def __init__(self, model_path: str, mllm_qualcomm_max_length=2048): self.tokenizer = AutoTokenizer.from_pretrained(model_path) @@ -167,3 +176,7 @@ def calibrate(self, num_samples=64, max_seq_length=512): # 4. Close Observer, freeze calibrated quantization parameters self.freeze_activation() print("\nCalibration completed, activation quantization parameters frozen.") + + def convert(self): + self.model.apply(convert_weight) + self.model.model.convert_rope_for_deploy() diff --git a/pymllm/backends/qualcomm/transformers/qwen3/train.py b/pymllm/backends/qualcomm/transformers/qwen3/train.py index 8432e4812..13ad2785a 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/train.py +++ b/pymllm/backends/qualcomm/transformers/qwen3/train.py @@ -40,9 +40,14 @@ def main(): m.calibrate(num_samples=args.num_samples, max_seq_length=args.max_length) # m.compile() m.infer(args.infer_text) + + # !!! + # Things below is for deploy. We will turn all fp32 weights and some buffers(rope) to quantized dtype. + # !!! 
m.model.lm_head.weight = torch.nn.Parameter( m.model.model.embed_tokens.weight.clone() ) + m.convert() os.makedirs(args.output_dir, exist_ok=True) model_save_path = os.path.join(args.output_dir, "model.safetensors") diff --git a/pymllm/convertor/mllm_type_mapping.py b/pymllm/convertor/mllm_type_mapping.py index 0b98b7e6e..05ea544c2 100644 --- a/pymllm/convertor/mllm_type_mapping.py +++ b/pymllm/convertor/mllm_type_mapping.py @@ -91,6 +91,7 @@ torch.qint8: 16, # kInt8 torch.quint8: 129, # kUInt8 torch.qint32: 18, # kInt32 + torch.uint16: 130, # kUInt16 } ) From 82900b15bdf8e05c1456742a68b63df359beb122 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Mon, 5 Jan 2026 08:55:55 +0000 Subject: [PATCH 11/13] fix: mismatched outputs and inputs --- mllm/backends/qnn/aot/passes/PTQPass.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllm/backends/qnn/aot/passes/PTQPass.cpp b/mllm/backends/qnn/aot/passes/PTQPass.cpp index 0539b23a2..8bd523641 100644 --- a/mllm/backends/qnn/aot/passes/PTQPass.cpp +++ b/mllm/backends/qnn/aot/passes/PTQPass.cpp @@ -181,7 +181,7 @@ void recursiveSolveNormal(const std::shared_ptr& ir_ctx, const ir auto outputs = op->outputs(); for (auto iii : inputs) { __recursiveSolveNormalImpl(iii->cast_()); } - for (auto ooo : inputs) { __recursiveSolveNormalImpl(ooo->cast_()); } + for (auto ooo : outputs) { __recursiveSolveNormalImpl(ooo->cast_()); } } if (op->isa_()) { From 54d9927cb8d31cbb0016b0064ae5583925fb0f16 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Mon, 5 Jan 2026 10:13:57 +0000 Subject: [PATCH 12/13] fix: typos --- .../qnn/aot/passes/LLMQuantRecipePass.cpp | 6 +- mllm/backends/qnn/aot/passes/PTQPass.cpp | 6 +- mllm/core/OpTypes.hpp | 1 + .../qualcomm/transformers/core/rms_norm.py | 2 +- .../transformers/core/test_qlinear.py | 89 ------------------- 5 files changed, 8 insertions(+), 96 deletions(-) delete mode 100644 pymllm/backends/qualcomm/transformers/core/test_qlinear.py diff 
--git a/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp b/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp index f60ecc14d..3b7291931 100644 --- a/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp +++ b/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp @@ -389,7 +389,7 @@ bool LLMQuantRecipeRMSNormPattern::rewrite(ir::IRWriter& writer, const ir::op_pt // FIXME: This dtype is hardcoded. We should make it right. auto weight_spec_attr = writer.create( - ir::linalg::QuantizationSpecAsymPerTensor::create(0, 65536, kUInt16, kFloat32, kInt32, Tensor::nil(), Tensor::nil())); + ir::linalg::QuantizationSpecAsymPerTensor::create(0, 65536 - 1, kUInt16, kFloat32, kInt32, Tensor::nil(), Tensor::nil())); weight_reg_tensor_ir->outputs().front()->setAttr("quant_recipe", weight_spec_attr); // Get self anno @@ -767,7 +767,7 @@ bool LLMQuantRecipeLinearPattern::rewrite(ir::IRWriter& writer, const ir::op_ptr ir::linalg::QuantizationSpecLPBQ::create(-8, 7, block_size, -1, 4, kUInt4, kFloat32, Tensor::nil(), Tensor::nil()); // output sym int16 - auto out_quant_spec = ir::linalg::QuantizationSpecAsymPerTensor::create(0, 65536, kUInt16, kFloat32, kInt32, + auto out_quant_spec = ir::linalg::QuantizationSpecAsymPerTensor::create(0, 65536 - 1, kUInt16, kFloat32, kInt32, Tensor::nil(), Tensor::nil()); linear_ir->outputs().front()->setAttr("quant_recipe", writer.create(out_quant_spec)); @@ -867,7 +867,7 @@ bool LLMQuantRecipeEmbeddingPattern::rewrite(ir::IRWriter& writer, const ir::op_ o_0->setAttr("quant_recipe", o_0_spec); annotation_attr->annotation_.outputs.emplace_back(o_0_spec->spec_); } else { - annotation_attr->annotation_.inputs.emplace_back( + annotation_attr->annotation_.outputs.emplace_back( o_0->getAttr("quant_recipe")->cast_()->spec_); } diff --git a/mllm/backends/qnn/aot/passes/PTQPass.cpp b/mllm/backends/qnn/aot/passes/PTQPass.cpp index 8bd523641..ea95c28c7 100644 --- a/mllm/backends/qnn/aot/passes/PTQPass.cpp +++ b/mllm/backends/qnn/aot/passes/PTQPass.cpp @@ -132,7 
+132,7 @@ void recursiveSolveWeights(const std::shared_ptr& ir_ctx, const i }); } -void __recursiveSolveNormalImpl(const ir::Val::ptr_t& v) { +void _recursiveSolveNormalImpl(const ir::Val::ptr_t& v) { MLLM_RT_ASSERT(v->isa_()); auto tv = v->cast_(); MLLM_RT_ASSERT(tv->getAttr("quant_recipe")); @@ -180,8 +180,8 @@ void recursiveSolveNormal(const std::shared_ptr& ir_ctx, const ir auto inputs = op->inputs(); auto outputs = op->outputs(); - for (auto iii : inputs) { __recursiveSolveNormalImpl(iii->cast_()); } - for (auto ooo : outputs) { __recursiveSolveNormalImpl(ooo->cast_()); } + for (auto iii : inputs) { _recursiveSolveNormalImpl(iii->cast_()); } + for (auto ooo : outputs) { _recursiveSolveNormalImpl(ooo->cast_()); } } if (op->isa_()) { diff --git a/mllm/core/OpTypes.hpp b/mllm/core/OpTypes.hpp index 849df8941..310b39cd0 100644 --- a/mllm/core/OpTypes.hpp +++ b/mllm/core/OpTypes.hpp @@ -180,6 +180,7 @@ inline std::string optype2Str(OpTypes type) { case OpTypes::kRadixAttnRelax: return "RadixAttnRelax"; case OpTypes::kEqual: return "Equal"; case OpTypes::kWhere: return "Where"; + case OpTypes::kSigmoid: return "Sigmoid"; case OpTypes::kDynamicOp_Start: return "DynamicOp_Start"; case OpTypes::kOpType_End: return "OpType_End"; default: return "Unknown"; diff --git a/pymllm/backends/qualcomm/transformers/core/rms_norm.py b/pymllm/backends/qualcomm/transformers/core/rms_norm.py index ec6345d64..eb3d34b70 100644 --- a/pymllm/backends/qualcomm/transformers/core/rms_norm.py +++ b/pymllm/backends/qualcomm/transformers/core/rms_norm.py @@ -120,4 +120,4 @@ def disable_quant(self): self.weight_fake_quant.disable_fakequant() def extra_repr(self): - return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + return f"{tuple(self.weight.shape)}, eps={self.eps}" diff --git a/pymllm/backends/qualcomm/transformers/core/test_qlinear.py b/pymllm/backends/qualcomm/transformers/core/test_qlinear.py deleted file mode 100644 index 69edd69f6..000000000 --- 
a/pymllm/backends/qualcomm/transformers/core/test_qlinear.py +++ /dev/null @@ -1,89 +0,0 @@ -import torch -import torch.nn as nn -from pymllm.backends.qualcomm.transformers.core.qlinear import QLinearLPBQ - - -def test_qlinear_lpbq(): - """ - Test QLinearLPBQ implementation against bf16 baseline. - - This test verifies that the double quantization implementation - produces results close to the bf16 baseline when using appropriate - quantization parameters. - """ - # Set random seed for reproducibility - torch.manual_seed(42) - - # Test parameters - in_features = 256 - out_features = 128 - batch_size = 4 - seq_len = 16 - block_size = 64 - - # Create input tensor (bf16 baseline) - x_bf16 = torch.randn(batch_size, seq_len, in_features, dtype=torch.bfloat16) - - # Create reference linear layer (bf16) - linear_bf16 = nn.Linear(in_features, out_features, bias=True, dtype=torch.bfloat16) - # Copy weights and bias to ensure same values - with torch.no_grad(): - linear_bf16.weight.copy_( - torch.randn(out_features, in_features, dtype=torch.bfloat16) - ) - linear_bf16.bias.copy_(torch.zeros(out_features, dtype=torch.bfloat16)) - - # Get bf16 reference output - with torch.no_grad(): - output_bf16 = linear_bf16(x_bf16) - - # Create QLinearLPBQ with same weights - qlinear = QLinearLPBQ( - in_features=in_features, - out_features=out_features, - bias=True, - block_size=block_size, - already_quantized_weight=False, - already_quantized_activation=False, - ) - - # Copy the same weights and bias - with torch.no_grad(): - qlinear.weight.copy_(linear_bf16.weight.data) - if qlinear.bias is not None: - qlinear.bias.copy_(linear_bf16.bias.data) - - # Get quantized output - with torch.no_grad(): - output_q = qlinear(x_bf16) - output_q_bf16 = output_q - - # Calculate metrics - mse = torch.mean((output_bf16 - output_q_bf16) ** 2) - mae = torch.mean(torch.abs(output_bf16 - output_q_bf16)) - - # Calculate relative error - relative_error = torch.mean( - torch.abs(output_bf16 - output_q_bf16) / 
(torch.abs(output_bf16) + 1e-8) - ) - - # Print results - print("=== QLinearLPBQ Test Results ===") - print(f"Input shape: {x_bf16.shape}") - print(f"Output shape: {output_bf16.shape}") - print(f"Block size: {block_size}") - print("\nComparison with bf16 baseline:") - print(f"MSE: {mse:.6e}") - print(f"MAE: {mae:.6e}") - print(f"Relative Error: {relative_error:.6e}") - - # Check if results are within acceptable tolerance - # For double quantization, we expect some error but should be reasonable - tolerance = 0.1 # 10% relative error tolerance - - if relative_error < tolerance: - print(f"\n✓ TEST PASSED: Relative error {relative_error:.6e} < {tolerance}") - return True - else: - print(f"\n✗ TEST FAILED: Relative error {relative_error:.6e} >= {tolerance}") - return False From fb4075936b89f64b17af1c5aca6d9478694cad4f Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Mon, 5 Jan 2026 11:10:26 +0000 Subject: [PATCH 13/13] fix: typos --- mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp b/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp index 3b7291931..37fdaffec 100644 --- a/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp +++ b/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp @@ -857,6 +857,8 @@ bool LLMQuantRecipeEmbeddingPattern::rewrite(ir::IRWriter& writer, const ir::op_ if (!i_0->getAttr("quant_recipe")) { auto i_0_spec = genSimpleQuantizationSpecAttr(writer.getContext(), i_0->cast_()); i_0->setAttr("quant_recipe", i_0_spec); + annotation_attr->annotation_.inputs.emplace_back( + i_0->getAttr("quant_recipe")->cast_()->spec_); } else { annotation_attr->annotation_.inputs.emplace_back( i_0->getAttr("quant_recipe")->cast_()->spec_);