From 35caf8766ff21535649100d8639c8bc7303cf998 Mon Sep 17 00:00:00 2001
From: Kimish Patel <kimishpatel@fb.com>
Date: Tue, 24 Jun 2025 10:53:31 -0700
Subject: [PATCH] Fix bug in le broadcast for single element broadcast

Summary:
Really comparison ops cannot be handled in the same way as other binary
ops for broadcasting because output tensor dtype is different than input
tensor dtype.

As a result we just have to fall back to portable.

Even the current vectorized impl for le, ge etc. assumes that the output
type of compare is same as input type. That might actually be a bug.

A potential way to handle this maybe via vectorized compare natively
supporting binary output vector

Test Plan:
Tests added which fail before and passes after

Reviewers:

Subscribers:

Tasks:

Tags:
---
 kernels/optimized/cpu/op_le.cpp | 49 ------------------------
 kernels/test/op_le_test.cpp     | 67 +++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+), 49 deletions(-)
diff --git a/kernels/optimized/cpu/op_le.cpp b/kernels/optimized/cpu/op_le.cpp
index 4aeeb69323f..8e56e1ca4fc 100644
--- a/kernels/optimized/cpu/op_le.cpp
+++ b/kernels/optimized/cpu/op_le.cpp
@@ -30,57 +30,8 @@ Tensor& opt_le_tensor_out(
   (void)ctx;
 
   ScalarType a_type = a.scalar_type();
-  ScalarType b_type = b.scalar_type();
   ScalarType out_type = out.scalar_type();
 
-  if (a.numel() == 1 || b.numel() == 1) {
-    const Tensor* tensor;
-    const Tensor* scalar;
-    ScalarType tensor_type;
-    ScalarType scalar_type;
-    if (a.numel() == 1) {
-      tensor = &b;
-      tensor_type = b_type;
-      scalar = &a;
-      scalar_type = a_type;
-    } else {
-      tensor = &a;
-      tensor_type = a_type;
-      scalar = &b;
-      scalar_type = b_type;
-    }
-    ET_KERNEL_CHECK(
-        ctx,
-        resize_to_broadcast_target_size(a, b, out) == Error::Ok,
-        InvalidArgument,
-        out);
-
-    constexpr auto name = "le.Tensor_out";
-
-    ET_SWITCH_REALB_TYPES(tensor_type, ctx, name, CTYPE, [&]() {
-      ET_SWITCH_REALB_TYPES(scalar_type, ctx, name, CTYPE_SCALAR, [&]() {
-        CTYPE_SCALAR scalar_val = *scalar->const_data_ptr<CTYPE_SCALAR>();
-        CTYPE scalar_casted = static_cast<CTYPE>(scalar_val);
-
-        using Vec = at::vec::Vectorized<CTYPE>;
-        if (a.numel() == 1) {
-          at::vec::map<CTYPE>(
-              [scalar_casted](Vec x) { return Vec(scalar_casted).le(x); },
-              out.mutable_data_ptr<CTYPE>(),
-              tensor->const_data_ptr<CTYPE>(),
-              out.numel());
-        } else {
-          at::vec::map<CTYPE>(
-              [scalar_casted](Vec x) { return x.le(Vec(scalar_casted)); },
-              out.mutable_data_ptr<CTYPE>(),
-              tensor->const_data_ptr<CTYPE>(),
-              out.numel());
-        }
-      });
-    });
-    return out;
-  }
-
   // Check for optimized broadcast paths
   auto selected_optimized_path = select_optimized_path(a, b, out);
   if (selected_optimized_path == ElementwiseOptimizedPath::kTreatAs1d) {
diff --git a/kernels/test/op_le_test.cpp b/kernels/test/op_le_test.cpp
index d8ecec11c46..4a9b97dfe8a 100644
--- a/kernels/test/op_le_test.cpp
+++ b/kernels/test/op_le_test.cpp
@@ -1112,3 +1112,70 @@ TEST_F(OpLeTensorOutTest, Broadcast22dBy1dReverseTest) {
 
   EXPECT_TENSOR_EQ(out, tf_bool.make({3, 4}, expected_data));
 }
+
+TEST_F(OpLeTensorOutTest, MonotonicIncreasingVsScalarBroadcastTest) {
+  TensorFactory<ScalarType::Int> tf;
+  TensorFactory<ScalarType::Bool> tf_bool;
+
+  // Test case: 1D tensor [0, 1, 2, ..., 63] vs 2D tensor [1, 1] with value 2
+  std::vector<int32_t> lhs_data;
+  for (int i = 0; i < 64; ++i) {
+    lhs_data.push_back(i);
+  }
+
+  Tensor lhs = tf.make({64}, lhs_data);
+  Tensor rhs = tf.make({1, 1}, {2});
+  Tensor out = tf_bool.zeros({1, 64});
+
+  op_le_tensor_out(lhs, rhs, out);
+
+  // Expected: [0, 1, 2] <= 2 should be [true, true, true], rest false
+  using ctype =
+      executorch::runtime::testing::internal::ScalarTypeToCppTypeWrapper<
+          ScalarType::Bool>::ctype;
+  std::vector<ctype> expected_data;
+  for (int i = 0; i < 64; ++i) {
+    expected_data.push_back(i <= 2);
+  }
+
+  EXPECT_TENSOR_EQ(out, tf_bool.make({1, 64}, expected_data));
+
+  // Test with rhs value 4
+  rhs = tf.make({1, 1}, {4});
+  out = tf_bool.zeros({1, 64});
+
+  op_le_tensor_out(lhs, rhs, out);
+
+  expected_data.clear();
+  for (int i = 0; i < 64; ++i) {
+    expected_data.push_back(i <= 4);
+  }
+
+  EXPECT_TENSOR_EQ(out, tf_bool.make({1, 64}, expected_data));
+
+  // Test with rhs value 10
+  rhs = tf.make({1, 1}, {10});
+  out = tf_bool.zeros({1, 64});
+
+  op_le_tensor_out(lhs, rhs, out);
+
+  expected_data.clear();
+  for (int i = 0; i < 64; ++i) {
+    expected_data.push_back(i <= 10);
+  }
+
+  EXPECT_TENSOR_EQ(out, tf_bool.make({1, 64}, expected_data));
+
+  // Test with rhs value 32
+  rhs = tf.make({1, 1}, {32});
+  out = tf_bool.zeros({1, 64});
+
+  op_le_tensor_out(lhs, rhs, out);
+
+  expected_data.clear();
+  for (int i = 0; i < 64; ++i) {
+    expected_data.push_back(i <= 32);
+  }
+
+  EXPECT_TENSOR_EQ(out, tf_bool.make({1, 64}, expected_data));
+}