From e10716b05e14ce14862c5a97f5b034400dfa1600 Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Tue, 7 May 2024 22:23:06 +0000
Subject: [PATCH 01/30] linear op ir node

---
 csrc/ir/internal_nodes.h | 43 +++++++++++++++++++++++
 csrc/ir/nodes.cpp        | 44 +++++++++++++++++++++++
 csrc/ops/composite.cpp   | 75 +++++++++++++++++++++++-----------------
 3 files changed, 130 insertions(+), 32 deletions(-)
diff --git a/csrc/ir/internal_nodes.h b/csrc/ir/internal_nodes.h
index ba9608909db..5125fdc8f94 100644
--- a/csrc/ir/internal_nodes.h
+++ b/csrc/ir/internal_nodes.h
@@ -2288,4 +2288,47 @@ class MatmulOp : public Expr {
       const std::vector<PolymorphicValue>& inputs) const override;
 };
 
+//! Linear Operator to be expression evaluated without decomposition.
+class LinearOp : public Expr {
+ public:
+  using Expr::Expr;
+
+  LinearOp(IrBuilderPasskey, Val* out, Val* in_a, Val* in_b);
+
+  NVFUSER_DECLARE_CLONE_AND_CREATE
+
+  const char* getOpString() const override {
+    return "LinearOp";
+  }
+
+  std::string toString(int indent_size = 0) const override;
+  std::string toInlineString(int indent_size = 0) const override;
+
+  Val* out() const {
+    return output(0);
+  }
+
+  Val* inA() const {
+    return input(0);
+  }
+
+  Val* inB() const {
+    return input(1);
+  }
+
+  Val* bias() const {
+    if (has_bias_) {
+      return input(2);
+    } else {
+      return nullptr;
+    }
+  }
+
+  std::vector<PolymorphicValue> evaluate(
+      const ExpressionEvaluator& ee,
+      const std::vector<PolymorphicValue>& inputs) const override;
+};
+private:
+  bool has_bias_ = false;
+
 } // namespace nvfuser
diff --git a/csrc/ir/nodes.cpp b/csrc/ir/nodes.cpp
index 879504c07e2..db7d0c207b9 100644
--- a/csrc/ir/nodes.cpp
+++ b/csrc/ir/nodes.cpp
@@ -4501,4 +4501,48 @@ std::vector<PolymorphicValue> MatmulOp::evaluate(
   return {at::matmul(a, b)};
 }
 
+LinearOp::LinearOp(IrBuilderPasskey passkey, Val* out, Val* in_a, Val* in_b, Val* bias)
+    : Expr(passkey) {
+  addOutput(out);
+  addInput(in_a);
+  addInput(in_b);
+
+  if (bias != nullptr){
+    this->has_bias_ = true;
+    addInput(bias);
+  }
+}
+
+NVFUSER_DEFINE_CLONE_AND_CREATE(LinearOp)
+
+std::string LinearOp::toString(int indent_size) const {
+  std::stringstream ss;
+  indent(ss, indent_size) << out()->toString() << "\n";
+  indent(ss, indent_size + 1) << " = linear(" << inA()->toString() << ",\n";
+  indent(ss, indent_size + 1) << "       " << inB()->toString();
+  if (this->has_bias_){
+    indent(ss, indent_size + 1) << ",\n      " << bias()->toString(); 
+  }
+  indent(ss, indent_size + 1) << ")\n";
+  return ss.str();
+}
+
+std::string LinearOp::toInlineString(int indent_size) const {
+  NVF_CHECK(false, "Tensor op can not be printed inline");
+}
+
+std::vector<PolymorphicValue> LinearOp::evaluate(
+    const ExpressionEvaluator& ee,
+    const std::vector<PolymorphicValue>& inputs) const {
+  const auto a = inputs.at(0).as<at::Tensor>();
+  const auto b = inputs.at(1).as<at::Tensor>();
+
+  if (this->has_bias_) {
+    const auto bias = inputs.at(2).as<at::Tensor>();
+    return {at::linear(a, b, bias)};
+  }
+  return {at::linear(a, b)};
+}
+
+
 } // namespace nvfuser
diff --git a/csrc/ops/composite.cpp b/csrc/ops/composite.cpp
index cdeeaecb624..20a7a126bf1 100644
--- a/csrc/ops/composite.cpp
+++ b/csrc/ops/composite.cpp
@@ -54,42 +54,53 @@ TensorView* dropout_backward(TensorView* dy, TensorView* mask, Val* scale) {
   return dx;
 }
 
-TensorView* linear(TensorView* a, TensorView* b, TensorView* bias) {
-  // TODO: Support 1+ dimensional A.
-  NVF_CHECK(
-      (a->nDims() == 2 && b->nDims() == 2),
-      "Only 2-D Inputs and Weights are currently supported in Linear!");
-
-  std::vector<bool> bcast_dims(a->nDims() + 1, false);
-  // A: [M, Bcast, K]
-  // B: [Bcast, N, K]
-  bcast_dims.at(bcast_dims.size() - 2) = true;
-  auto* tv0b = broadcast(a, bcast_dims);
-  bcast_dims.at(bcast_dims.size() - 2) = false;
-  bcast_dims.at(bcast_dims.size() - 3) = true;
-  auto* tv1b = broadcast(b, bcast_dims);
+namespace {
 
-  NVF_CHECK(
-      a->getDataType().value() == b->getDataType().value(),
-      "data types of inputs to matmul don't match");
-
-  auto* output = fusedMultiplySum(tv0b, tv1b, {-1});
-  if (bias) {
-    NVF_CHECK(
-        (bias->nDims() <= a->nDims()), "bias should be broadcastable to A");
-    NVF_CHECK(
-        a->getDataType().value() == bias->getDataType().value(),
-        "bias doesn't match input/weight dtype");
-    auto* bias_with_cast = maybeCastOp(output->getDataType().value(), bias);
-    auto* bcast_bias = ops::maybeBroadcast({output, bias_with_cast})[1];
-    auto* bias_output = add(output, bcast_bias);
-    return maybeCastOp(a->getDataType().value(), bias_output);
+static TensorView* newForLinear(TensorView* input, TensorView* weight, TensorView* bias) {
+  auto input_domain =
+      TensorDomain::noReductions(input->getMaybeRFactorDomain());
+
+  // Linear: inputs = {*, in_features}, weight = {out_features, in_features} / {in_features}
+  // For the linear output, all but the last dimension are the same shape as the input.
+  // The last dimension is out_features.
+
+  auto ndims_out = (input.size() - 1)+ (weight.size() - 1);
+  std::vector<IterDomain*> out_domain(ndims_out, nullptr);  
+
+  for (auto idx : c10::irange(ndims_out - 1)) {
+    out_domain[idx] = ops::newOutputIterDomain({input_domain.at(idx)});
+  }
+  if (weight.size() == 2){
+    // Add out_features to output domain.
+    auto weight_domain = TensorDomain::noReductions(weight->getMaybeRFactorDomain());
+    if (bias != nullptr) {
+      auto bias_domain = TensorDomain::noReductions(bias->getMaybeRFactorDomain());
+      out_domain[ndims_out - 1] = ops::newOutputIterDomain({weight_domain.at(0), bias_domain.at(0)});
+    } else {
+      out_domain[ndims_out - 1] = ops::newOutputIterDomain({weight_domain.at(0)});
+    }
   }
-  return maybeCastOp(a->getDataType().value(), output);
+
+  TensorDomain* td = IrBuilder::create<TensorDomain>(
+      out_domain, TensorDomain::getContiguityFilledWith(out_domain, true));
+
+  return IrBuilder::create<TensorView>(td, input->dtype());
+}
+
+} // namespace
+
+TensorView* linear(TensorView* tv_a, TensorView* tv_b, TensorView* bias) {
+  NVF_CHECK(tv_a->nDims() >= 1);
+  NVF_CHECK(tv_b->nDims() == 1 || tv_b->nDims() == 2);
+
+  // For all other cases, create a new LinearOp
+  TensorView* out = newForLinear(tv_a, tv_b, bias);
+  IrBuilder::create<LinearOp>(out, tv_a, tv_b, bias);
+  return out;
 }
 
-TensorView* linear(TensorView* a, TensorView* b) {
-  return linear(a, b, nullptr /*bias*/);
+TensorView* linear(TensorView* tv_a, TensorView* tv_b) {
+  return linear(tv_a, tv_b, /*bias=*/nullptr);
 }
 
 LstmResult lstm(

From f5fa9558e2271cf13de8713d8159e9f7b05ce982 Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Tue, 14 May 2024 00:17:51 +0000
Subject: [PATCH 02/30] add linear op to dispatch

---
 csrc/dispatch.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/csrc/dispatch.h b/csrc/dispatch.h
index c2150dda35e..714b37c3e15 100644
--- a/csrc/dispatch.h
+++ b/csrc/dispatch.h
@@ -108,6 +108,7 @@ class Val;
   f(Swizzle2D);                   \
   f(Resize);                      \
   f(MatmulOp);                    \
+  f(LinearOp);                    \
   f(Communication);
 #define DISPATCH_FOR_ALL_KIR_EXPRS(f) \
   f(Allocate);                        \

From 1bd774ff946a56f4591a984e685390e60b4a2146 Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Tue, 14 May 2024 02:29:31 +0000
Subject: [PATCH 03/30] add bias, test

---
 csrc/ir/internal_nodes.h                  |  13 ++-
 csrc/ir/nodes.cpp                         |   9 +-
 csrc/ops/composite.cpp                    |  18 ++--
 tests/cpp/test_matmul_aten_evaluation.cpp | 126 ++++++++++++++++++++++
 4 files changed, 148 insertions(+), 18 deletions(-)

diff --git a/csrc/ir/internal_nodes.h b/csrc/ir/internal_nodes.h
index 5125fdc8f94..9dbf80dbe5c 100644
--- a/csrc/ir/internal_nodes.h
+++ b/csrc/ir/internal_nodes.h
@@ -2293,7 +2293,7 @@ class LinearOp : public Expr {
  public:
   using Expr::Expr;
 
-  LinearOp(IrBuilderPasskey, Val* out, Val* in_a, Val* in_b);
+  LinearOp(IrBuilderPasskey, Val* out, Val* in_a, Val* in_b, Val* bias);
 
   NVFUSER_DECLARE_CLONE_AND_CREATE
 
@@ -2317,7 +2317,7 @@ class LinearOp : public Expr {
   }
 
   Val* bias() const {
-    if (has_bias_) {
+    if (has_bias()) {
       return input(2);
     } else {
       return nullptr;
@@ -2327,8 +2327,13 @@ class LinearOp : public Expr {
   std::vector<PolymorphicValue> evaluate(
       const ExpressionEvaluator& ee,
       const std::vector<PolymorphicValue>& inputs) const override;
-};
+
 private:
-  bool has_bias_ = false;
+  bool has_bias() const {
+    return inputs().size() == 3;
+  }
+
+};
+
 
 } // namespace nvfuser
diff --git a/csrc/ir/nodes.cpp b/csrc/ir/nodes.cpp
index db7d0c207b9..a8062afad8e 100644
--- a/csrc/ir/nodes.cpp
+++ b/csrc/ir/nodes.cpp
@@ -4508,7 +4508,6 @@ LinearOp::LinearOp(IrBuilderPasskey passkey, Val* out, Val* in_a, Val* in_b, Val
   addInput(in_b);
 
   if (bias != nullptr){
-    this->has_bias_ = true;
     addInput(bias);
   }
 }
@@ -4519,9 +4518,9 @@ std::string LinearOp::toString(int indent_size) const {
   std::stringstream ss;
   indent(ss, indent_size) << out()->toString() << "\n";
   indent(ss, indent_size + 1) << " = linear(" << inA()->toString() << ",\n";
-  indent(ss, indent_size + 1) << "       " << inB()->toString();
-  if (this->has_bias_){
-    indent(ss, indent_size + 1) << ",\n      " << bias()->toString(); 
+  indent(ss, indent_size + 1) << "          " << inB()->toString();
+  if (has_bias()){
+    indent(ss, indent_size + 1) << ",\n          " << bias()->toString(); 
   }
   indent(ss, indent_size + 1) << ")\n";
   return ss.str();
@@ -4537,7 +4536,7 @@ std::vector<PolymorphicValue> LinearOp::evaluate(
   const auto a = inputs.at(0).as<at::Tensor>();
   const auto b = inputs.at(1).as<at::Tensor>();
 
-  if (this->has_bias_) {
+  if (has_bias()) {
     const auto bias = inputs.at(2).as<at::Tensor>();
     return {at::linear(a, b, bias)};
   }
diff --git a/csrc/ops/composite.cpp b/csrc/ops/composite.cpp
index 20a7a126bf1..db5afa70a86 100644
--- a/csrc/ops/composite.cpp
+++ b/csrc/ops/composite.cpp
@@ -57,22 +57,22 @@ TensorView* dropout_backward(TensorView* dy, TensorView* mask, Val* scale) {
 namespace {
 
 static TensorView* newForLinear(TensorView* input, TensorView* weight, TensorView* bias) {
-  auto input_domain =
-      TensorDomain::noReductions(input->getMaybeRFactorDomain());
-
+  auto input_domain = TensorDomain::noReductions(input->getMaybeRFactorDomain());
+  auto weight_domain = TensorDomain::noReductions(weight->getMaybeRFactorDomain());
+  
   // Linear: inputs = {*, in_features}, weight = {out_features, in_features} / {in_features}
   // For the linear output, all but the last dimension are the same shape as the input.
-  // The last dimension is out_features.
-
-  auto ndims_out = (input.size() - 1)+ (weight.size() - 1);
+  // The last dimension is out_features (if present).
+  auto ndims_out = (input_domain.size() - 1)+ (weight_domain.size() - 1);
+  
   std::vector<IterDomain*> out_domain(ndims_out, nullptr);  
 
-  for (auto idx : c10::irange(ndims_out - 1)) {
+  for (auto idx : c10::irange(input_domain.size() - 1)) {
     out_domain[idx] = ops::newOutputIterDomain({input_domain.at(idx)});
   }
-  if (weight.size() == 2){
+
+  if (weight_domain.size() == 2){
     // Add out_features to output domain.
-    auto weight_domain = TensorDomain::noReductions(weight->getMaybeRFactorDomain());
     if (bias != nullptr) {
       auto bias_domain = TensorDomain::noReductions(bias->getMaybeRFactorDomain());
       out_domain[ndims_out - 1] = ops::newOutputIterDomain({weight_domain.at(0), bias_domain.at(0)});
diff --git a/tests/cpp/test_matmul_aten_evaluation.cpp b/tests/cpp/test_matmul_aten_evaluation.cpp
index 94da409b504..9063d2904ec 100644
--- a/tests/cpp/test_matmul_aten_evaluation.cpp
+++ b/tests/cpp/test_matmul_aten_evaluation.cpp
@@ -47,6 +47,19 @@ class ATenNodesParametrizedTest
       optimization_guard_;
 };
 
+using LinearNodeParamType = std::tuple<Sizes, Sizes, std::optional<Sizes>>;
+class LinearNodeParametrizedTest
+    : public NVFuserFixtureParamTest<LinearNodeParamType> {
+ protected:
+  // Allocation order set by the pass breaks matmul tests
+  // see issue https://github.com/NVIDIA/Fuser/issues/1810
+  LinearNodeParametrizedTest() : optimization_guard_(false) {}
+
+ private:
+  preseg_passes::OptimizationPassGuard<preseg_passes::AllocationDomainPass>
+      optimization_guard_;
+};
+
 // fd.ops.matmul (a, b) where a = [M,K], b = [K,N]
 TEST_F(MatmulATenEvaluationTest, MmaOpAndCast) {
   auto fusion = std::make_unique<Fusion>();
@@ -552,6 +565,101 @@ TEST_P(ATenNodesParametrizedTest, MatmulNodeSymbolic) {
   EXPECT_TRUE(at::allclose(out[0], out_ref));
 }
 
+TEST_P(LinearNodeParametrizedTest, LinearNodeConcrete) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  const auto& [a_shape, b_shape, bias_shape] = GetParam();
+
+  auto tv0 = makeConcreteTensor(a_shape, DataType::Half);
+  auto tv1 = makeConcreteTensor(b_shape, DataType::Half);
+  TensorView* bias = nullptr;
+  if (bias_shape.has_value()){
+    bias = makeConcreteTensor(*bias_shape, DataType::Half);
+  }
+  auto tv2 = linear(tv0, tv1, bias);
+
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+  if (bias_shape.has_value()){
+    fusion->addInput(bias);
+  }
+  fusion->addOutput(tv2);
+
+  at::Tensor t0 = at::randn(a_shape, at::kHalf).cuda();
+  at::Tensor t1 = at::randn(b_shape, at::kHalf).cuda();
+  std::optional<at::Tensor> bias_opt = std::nullopt;
+  if (bias_shape.has_value()) {
+    bias_opt = at::randn(*bias_shape, at::kHalf).cuda();
+  }
+  at::Tensor out_ref = at::linear(t0, t1, bias_opt);
+
+  FusionExecutor fe;
+  fusion->aliasOutputToInput(
+      fusion->outputs()[0], /*input=*/nullptr, AllocationType::Evaluate);
+
+  std::vector<at::Tensor> out = {};
+  if (bias_shape.has_value()){
+    fe.compileFusion(fusion.get(), {t0, t1, bias_opt});
+    out = fe.runFusion({t0, t1, bias_opt});
+  } else {
+    fe.compileFusion(fusion.get(), {t0, t1});
+    out = fe.runFusion({t0, t1});
+  }
+
+  // Verify that fusion compilation was skipped.
+  EXPECT_FALSE(fe.hasCompiledKernel());
+  EXPECT_TRUE(at::allclose(out[0], out_ref));
+}
+TEST_P(LinearNodeParametrizedTest, LinearNodeSymbolic) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  const auto& [a_shape, b_shape, bias_shape] = GetParam();
+  
+  auto tv0 = makeSymbolicTensor(a_shape.size(), DataType::Half);
+  auto tv1 = makeSymbolicTensor(b_shape.size(), DataType::Half);
+
+  TensorView* bias = nullptr;
+  if (bias_shape.has_value()){
+    bias = makeSymbolicTensor(*bias_shape, DataType::Half);
+  }
+
+  auto tv2 = linear(tv0, tv1, bias);
+
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+  if (bias_shape.has_value()){
+    fusion->addInput(bias);
+  }
+  fusion->addOutput(tv2);
+
+  at::Tensor t0 = at::randn(a_shape, at::kHalf).cuda();
+  at::Tensor t1 = at::randn(b_shape, at::kHalf).cuda();
+  std::optional<at::Tensor> bias_opt = std::nullopt;
+  if (bias_shape.has_value()) {
+    bias_opt = at::randn(*bias_shape, at::kHalf).cuda();
+  }
+  at::Tensor out_ref = at::linear(t0, t1, bias_opt);
+
+  FusionExecutor fe;
+  fusion->aliasOutputToInput(
+      fusion->outputs()[0], /*input=*/nullptr, AllocationType::Evaluate);
+
+  std::vector<at::Tensor> out = {};
+  if (bias_shape.has_value()){
+    fe.compileFusion(fusion.get(), {t0, t1, bias_opt});
+    out = fe.runFusion({t0, t1, bias_opt});
+  } else {
+    fe.compileFusion(fusion.get(), {t0, t1});
+    out = fe.runFusion({t0, t1});
+  }
+
+  // Verify that fusion compilation was skipped.
+  EXPECT_FALSE(fe.hasCompiledKernel());
+  EXPECT_TRUE(at::allclose(out[0], out_ref));
+}
+
 constexpr int64_t b = 128, m = 64, k = 32, n = 16;
 
 // Parametrize a_shape and b_shape
@@ -588,4 +696,22 @@ INSTANTIATE_TEST_SUITE_P(
             Sizes({1, 1}),
             Sizes({b, 1, n}))));
 
+INSTANTIATE_TEST_SUITE_P(
+    LinearWithoutBias,
+    LinearNodeParametrizedTest,
+    testing::Combine(
+      testing::Values(Sizes({k}), Sizes({m, k}), Sizes({b, m, k})),
+      testing::Values(Sizes({k}), Sizes({n, k})),
+      testing::Values(std::nullopt)
+    ));
+
+INSTANTIATE_TEST_SUITE_P(
+    LinearWithBias,
+    LinearNodeParametrizedTest,
+    testing::Combine(
+      testing::Values(Sizes({k}), Sizes({m, k}), Sizes({b, m, k})), 
+      testing::Values(Sizes({n, k})), 
+      testing::Values(Sizes({n}))
+    ));
+
 } // namespace nvfuser

From f189d6b2a7eff03fa747eef39f718e49a7f867aa Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Tue, 14 May 2024 04:27:43 +0000
Subject: [PATCH 04/30] add linear op to scheduler

---
 csrc/root_domain_map.cpp                  | 42 +++++++++++++++++++++++
 csrc/scheduler/expr_eval_sched.cpp        |  6 ++--
 tests/cpp/test_matmul_aten_evaluation.cpp | 36 +++++++++----------
 tests/python/pytest_input_generators.py   | 29 ++++++++--------
 4 files changed, 78 insertions(+), 35 deletions(-)

diff --git a/csrc/root_domain_map.cpp b/csrc/root_domain_map.cpp
index 0f6e01ded31..d4e8edc2572 100644
--- a/csrc/root_domain_map.cpp
+++ b/csrc/root_domain_map.cpp
@@ -198,6 +198,48 @@ std::unordered_map<IterDomain*, IterDomain*> PairwiseRootDomainMap::map(
     return dom_map;
   }
 
+  // For LinearOp, all but the last dimension are the same shape as the input.
+  // The last dimension is out_features (if present).
+  if (LinearOp* op = dynamic_cast<LinearOp*>(consumer_tv_->definition())) {
+    auto out_size = consumer_root.size();
+
+    // Check if the producer is A, B or bias.
+    MatmulRole input_role;
+    if (producer->sameAs(op->inA()->as<TensorView>()->domain())) {
+      input_role = MatmulRole::INPUT_A;
+    } else if (producer->sameAs(op->inB()->as<TensorView>()->domain())) {
+      input_role = MatmulRole::INPUT_B;
+    } else {
+      input_role = MatmulRole::INPUT_C;
+    }
+
+    switch (input_role) {
+      case MatmulRole::INPUT_A: {
+        for (auto inx : c10::irange(producer_root.size() - 1)) {
+          updatePairwiseRootDomainMap(
+            producer_root.at(inx),
+            consumer_root.at(inx));
+        }
+        break;
+      }
+      case MatmulRole::INPUT_B: {
+        if (producer_root.size() == 1) {
+          // out_features is not present, no mapping required.
+          break;
+        }
+      }
+      case MatmulRole::INPUT_C: {
+        updatePairwiseRootDomainMap(
+            producer_root.at(0),
+            consumer_root.at(out_size - 1));
+        break;
+      }
+      default:
+        NVF_ERROR(false, "Unexpected input type.");
+    }
+    return dom_map;
+  }
+
   size_t itc = 0, itp = 0;
   while (itc < consumer_root.size() && itp < producer_root.size()) {
     IterDomain* producer_id = producer_root.at(itp);
diff --git a/csrc/scheduler/expr_eval_sched.cpp b/csrc/scheduler/expr_eval_sched.cpp
index c600b2f0bea..a16acbf8f43 100644
--- a/csrc/scheduler/expr_eval_sched.cpp
+++ b/csrc/scheduler/expr_eval_sched.cpp
@@ -13,15 +13,15 @@
 
 namespace nvfuser {
 
-// Check if the fusion has a single MatmulOp node
+// Check if the fusion has a single MatmulOp/LinearOp node
 bool ExprEvalScheduler::canScheduleCompileTime(Fusion* fusion) {
   auto exprs = fusion->exprs();
-  if (exprs.size() == 1 && exprs.front()->isA<MatmulOp>()) {
+  if (exprs.size() == 1 && (exprs.front()->isA<MatmulOp>() || exprs.front()->isA<LinearOp>())) {
     return true;
   }
   scheduler_debug_utils::canScheduleRejectReason(
       heuristicType(),
-      "Fusion must contain a single expression of type MatmulOp");
+      "Fusion must contain a single expression of type MatmulOp or LinearOp");
   return false;
 }
 
diff --git a/tests/cpp/test_matmul_aten_evaluation.cpp b/tests/cpp/test_matmul_aten_evaluation.cpp
index 9063d2904ec..09a3b48f1ac 100644
--- a/tests/cpp/test_matmul_aten_evaluation.cpp
+++ b/tests/cpp/test_matmul_aten_evaluation.cpp
@@ -594,21 +594,21 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeConcrete) {
   }
   at::Tensor out_ref = at::linear(t0, t1, bias_opt);
 
-  FusionExecutor fe;
-  fusion->aliasOutputToInput(
-      fusion->outputs()[0], /*input=*/nullptr, AllocationType::Evaluate);
-
+  FusionExecutorCache fec(std::move(fusion));
+  
   std::vector<at::Tensor> out = {};
   if (bias_shape.has_value()){
-    fe.compileFusion(fusion.get(), {t0, t1, bias_opt});
-    out = fe.runFusion({t0, t1, bias_opt});
+    out = fec.runFusionWithInputs({t0, t1, bias_opt});
   } else {
-    fe.compileFusion(fusion.get(), {t0, t1});
-    out = fe.runFusion({t0, t1});
+    out = fec.runFusionWithInputs({t0, t1});
   }
 
+  const std::vector<FusionExecutor>& executors =
+      fec.getMostRecentKernelRuntime()->executors();
+  EXPECT_EQ(executors.size(), 1);
   // Verify that fusion compilation was skipped.
-  EXPECT_FALSE(fe.hasCompiledKernel());
+  EXPECT_FALSE(executors.front().hasCompiledKernel());
+  
   EXPECT_TRUE(at::allclose(out[0], out_ref));
 }
 TEST_P(LinearNodeParametrizedTest, LinearNodeSymbolic) {
@@ -642,21 +642,21 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeSymbolic) {
   }
   at::Tensor out_ref = at::linear(t0, t1, bias_opt);
 
-  FusionExecutor fe;
-  fusion->aliasOutputToInput(
-      fusion->outputs()[0], /*input=*/nullptr, AllocationType::Evaluate);
-
+  FusionExecutorCache fec(std::move(fusion));
+  
   std::vector<at::Tensor> out = {};
   if (bias_shape.has_value()){
-    fe.compileFusion(fusion.get(), {t0, t1, bias_opt});
-    out = fe.runFusion({t0, t1, bias_opt});
+    out = fec.runFusionWithInputs({t0, t1, bias_opt});
   } else {
-    fe.compileFusion(fusion.get(), {t0, t1});
-    out = fe.runFusion({t0, t1});
+    out = fec.runFusionWithInputs({t0, t1});
   }
 
+  const std::vector<FusionExecutor>& executors =
+      fec.getMostRecentKernelRuntime()->executors();
+  EXPECT_EQ(executors.size(), 1);
   // Verify that fusion compilation was skipped.
-  EXPECT_FALSE(fe.hasCompiledKernel());
+  EXPECT_FALSE(executors.front().hasCompiledKernel());
+  
   EXPECT_TRUE(at::allclose(out[0], out_ref));
 }
 
diff --git a/tests/python/pytest_input_generators.py b/tests/python/pytest_input_generators.py
index 8237381d6ed..c776afbba87 100644
--- a/tests/python/pytest_input_generators.py
+++ b/tests/python/pytest_input_generators.py
@@ -1524,18 +1524,19 @@ def linear_input_generator(
         requires_grad=requires_grad,
     )
 
-    def multiply_range(maximum, step):
-        assert maximum % step == 0
-        num_steps = int(math.log(maximum, step))
-        return tuple(
-            map(pow, itertools.repeat(step, num_steps), range(1, num_steps + 1))
-        )
+    B = 64
+    M = 512
+    N = 256
+    K = 32
 
-    # Ranges of tensor sizes: 8, 64, 512, 4096, 32768, ...
-    # Use a Cartesian product to create a wide range of matrix shapes
-    # I'll stop at 512 as possible numerical difference may show up.
-    M, N, K = itertools.repeat(multiply_range(512, 8), 3)
-    for M, N, K in itertools.product(M, N, K):
-        lhs_shape = (M, K)
-        rhs_shape = (N, K)
-        yield (SampleInput(make_arg(lhs_shape), make_arg(rhs_shape), make_arg((N,))))
+    # Cases without bias
+    shapes_input = ((K), (M, K), (B, M, K))
+    # shapes_weight = ((K), (N, K))
+    # for shape_input, shape_weight in itertools.product(shapes_input, shapes_weight):
+    #     yield SampleInput(make_arg(shape_input), make_arg(shape_weight))
+
+    # Cases with bias
+    shape_weight = (N, K)
+    shape_bias = (N,)
+    for shape_input in shapes_input:
+        yield SampleInput(make_arg(shape_input), make_arg(shape_weight), make_arg(shape_bias))

From d60023a2b9ccd42e90067d470ffa572cfc320194 Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Tue, 14 May 2024 04:57:28 +0000
Subject: [PATCH 05/30] allow variable number of inputs in input generators

---
 tests/python/pytest_fusion_definitions.py | 12 ++++++++----
 tests/python/pytest_input_generators.py   | 10 ++++++----
 tests/python/pytest_ops.py                | 12 ++++++++----
 3 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/tests/python/pytest_fusion_definitions.py b/tests/python/pytest_fusion_definitions.py
index d25448510c9..5e4d164584c 100644
--- a/tests/python/pytest_fusion_definitions.py
+++ b/tests/python/pytest_fusion_definitions.py
@@ -21,14 +21,18 @@ def parse_inputs_fusion_definition(fd: FusionDefinition, opinfo: OpInfo, *args):
 
     nvf_args = []
 
-    if opinfo.symbolic_parameter_list is None:
-        opinfo.symbolic_parameter_list = [ArgumentType.Symbolic] * len(args)
-    num_symbolic_parameters = len(opinfo.symbolic_parameter_list)
+    symbolic_parameter_list = (
+        opinfo.symbolic_parameter_list
+        if opinfo.symbolic_parameter_list is not None
+        else [ArgumentType.Symbolic] * len(args)
+    )
+
+    num_symbolic_parameters = len(symbolic_parameter_list)
     assert num_symbolic_parameters == len(
         args
     ), f"{num_symbolic_parameters} vs {len(args)}"
 
-    for arg_type, a in zip(opinfo.symbolic_parameter_list, args):
+    for arg_type, a in zip(symbolic_parameter_list, args):
         if arg_type == ArgumentType.Symbolic:
             if isinstance(a, torch.Tensor):
                 nvf_args.append(fd.from_pytorch(a))
diff --git a/tests/python/pytest_input_generators.py b/tests/python/pytest_input_generators.py
index c776afbba87..aa29430c751 100644
--- a/tests/python/pytest_input_generators.py
+++ b/tests/python/pytest_input_generators.py
@@ -1531,12 +1531,14 @@ def linear_input_generator(
 
     # Cases without bias
     shapes_input = ((K), (M, K), (B, M, K))
-    # shapes_weight = ((K), (N, K))
-    # for shape_input, shape_weight in itertools.product(shapes_input, shapes_weight):
-    #     yield SampleInput(make_arg(shape_input), make_arg(shape_weight))
+    shapes_weight = ((K), (N, K))
+    for shape_input, shape_weight in itertools.product(shapes_input, shapes_weight):
+        yield SampleInput(make_arg(shape_input), make_arg(shape_weight))
 
     # Cases with bias
     shape_weight = (N, K)
     shape_bias = (N,)
     for shape_input in shapes_input:
-        yield SampleInput(make_arg(shape_input), make_arg(shape_weight), make_arg(shape_bias))
+        yield SampleInput(
+            make_arg(shape_input), make_arg(shape_weight), make_arg(shape_bias)
+        )
diff --git a/tests/python/pytest_ops.py b/tests/python/pytest_ops.py
index 713f7dc35ef..ffcb8e7d535 100644
--- a/tests/python/pytest_ops.py
+++ b/tests/python/pytest_ops.py
@@ -26,12 +26,16 @@ def parse_args_fusion_execution(opinfo: OpInfo, *args):
     if len(args) == 0:
         return []
 
-    if opinfo.symbolic_parameter_list is None:
-        opinfo.symbolic_parameter_list = [ArgumentType.Symbolic] * len(args)
-    assert len(opinfo.symbolic_parameter_list) == len(args)
+    symbolic_parameter_list = (
+        opinfo.symbolic_parameter_list
+        if opinfo.symbolic_parameter_list is not None
+        else [ArgumentType.Symbolic] * len(args)
+    )
+
+    assert len(symbolic_parameter_list) == len(args)
 
     result = []
-    for arg_type, a in zip(opinfo.symbolic_parameter_list, args):
+    for arg_type, a in zip(symbolic_parameter_list, args):
         if arg_type == ArgumentType.Symbolic:
             if isinstance(a, list) and all(map(is_tensor, a)):
                 result.extend(a)

From 69c1eda6426a686a78c228eac9d3e94eadb0d268 Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Tue, 14 May 2024 05:01:25 +0000
Subject: [PATCH 06/30] format

---
 csrc/ir/internal_nodes.h                  |  4 +-
 csrc/ir/nodes.cpp                         | 14 ++++--
 csrc/ops/composite.cpp                    | 36 +++++++------
 csrc/root_domain_map.cpp                  |  6 +--
 csrc/scheduler/expr_eval_sched.cpp        |  3 +-
 tests/cpp/test_matmul_aten_evaluation.cpp | 61 ++++++++++++++---------
 6 files changed, 74 insertions(+), 50 deletions(-)

diff --git a/csrc/ir/internal_nodes.h b/csrc/ir/internal_nodes.h
index 9dbf80dbe5c..38892ef581b 100644
--- a/csrc/ir/internal_nodes.h
+++ b/csrc/ir/internal_nodes.h
@@ -2328,12 +2328,10 @@ class LinearOp : public Expr {
       const ExpressionEvaluator& ee,
       const std::vector<PolymorphicValue>& inputs) const override;
 
-private:
+ private:
   bool has_bias() const {
     return inputs().size() == 3;
   }
-
 };
 
-
 } // namespace nvfuser
diff --git a/csrc/ir/nodes.cpp b/csrc/ir/nodes.cpp
index a8062afad8e..e3369dad1b7 100644
--- a/csrc/ir/nodes.cpp
+++ b/csrc/ir/nodes.cpp
@@ -4501,13 +4501,18 @@ std::vector<PolymorphicValue> MatmulOp::evaluate(
   return {at::matmul(a, b)};
 }
 
-LinearOp::LinearOp(IrBuilderPasskey passkey, Val* out, Val* in_a, Val* in_b, Val* bias)
+LinearOp::LinearOp(
+    IrBuilderPasskey passkey,
+    Val* out,
+    Val* in_a,
+    Val* in_b,
+    Val* bias)
     : Expr(passkey) {
   addOutput(out);
   addInput(in_a);
   addInput(in_b);
 
-  if (bias != nullptr){
+  if (bias != nullptr) {
     addInput(bias);
   }
 }
@@ -4519,8 +4524,8 @@ std::string LinearOp::toString(int indent_size) const {
   indent(ss, indent_size) << out()->toString() << "\n";
   indent(ss, indent_size + 1) << " = linear(" << inA()->toString() << ",\n";
   indent(ss, indent_size + 1) << "          " << inB()->toString();
-  if (has_bias()){
-    indent(ss, indent_size + 1) << ",\n          " << bias()->toString(); 
+  if (has_bias()) {
+    indent(ss, indent_size + 1) << ",\n          " << bias()->toString();
   }
   indent(ss, indent_size + 1) << ")\n";
   return ss.str();
@@ -4543,5 +4548,4 @@ std::vector<PolymorphicValue> LinearOp::evaluate(
   return {at::linear(a, b)};
 }
 
-
 } // namespace nvfuser
diff --git a/csrc/ops/composite.cpp b/csrc/ops/composite.cpp
index db5afa70a86..f36d57ba3c1 100644
--- a/csrc/ops/composite.cpp
+++ b/csrc/ops/composite.cpp
@@ -56,28 +56,36 @@ TensorView* dropout_backward(TensorView* dy, TensorView* mask, Val* scale) {
 
 namespace {
 
-static TensorView* newForLinear(TensorView* input, TensorView* weight, TensorView* bias) {
-  auto input_domain = TensorDomain::noReductions(input->getMaybeRFactorDomain());
-  auto weight_domain = TensorDomain::noReductions(weight->getMaybeRFactorDomain());
-  
-  // Linear: inputs = {*, in_features}, weight = {out_features, in_features} / {in_features}
-  // For the linear output, all but the last dimension are the same shape as the input.
-  // The last dimension is out_features (if present).
-  auto ndims_out = (input_domain.size() - 1)+ (weight_domain.size() - 1);
-  
-  std::vector<IterDomain*> out_domain(ndims_out, nullptr);  
+static TensorView* newForLinear(
+    TensorView* input,
+    TensorView* weight,
+    TensorView* bias) {
+  auto input_domain =
+      TensorDomain::noReductions(input->getMaybeRFactorDomain());
+  auto weight_domain =
+      TensorDomain::noReductions(weight->getMaybeRFactorDomain());
+
+  // Linear: input = {*, in_features}; weight = {out_features, in_features} or
+  // {in_features}. All output dims but the last match the input's leading
+  // dims; the last output dim is out_features (absent for a 1-D weight).
+  auto ndims_out = (input_domain.size() - 1) + (weight_domain.size() - 1);
+
+  std::vector<IterDomain*> out_domain(ndims_out, nullptr);
 
   for (auto idx : c10::irange(input_domain.size() - 1)) {
     out_domain[idx] = ops::newOutputIterDomain({input_domain.at(idx)});
   }
 
-  if (weight_domain.size() == 2){
+  if (weight_domain.size() == 2) {
     // Add out_features to output domain.
     if (bias != nullptr) {
-      auto bias_domain = TensorDomain::noReductions(bias->getMaybeRFactorDomain());
-      out_domain[ndims_out - 1] = ops::newOutputIterDomain({weight_domain.at(0), bias_domain.at(0)});
+      auto bias_domain =
+          TensorDomain::noReductions(bias->getMaybeRFactorDomain());
+      out_domain[ndims_out - 1] =
+          ops::newOutputIterDomain({weight_domain.at(0), bias_domain.at(0)});
     } else {
-      out_domain[ndims_out - 1] = ops::newOutputIterDomain({weight_domain.at(0)});
+      out_domain[ndims_out - 1] =
+          ops::newOutputIterDomain({weight_domain.at(0)});
     }
   }
 
diff --git a/csrc/root_domain_map.cpp b/csrc/root_domain_map.cpp
index d4e8edc2572..b6ca1730f74 100644
--- a/csrc/root_domain_map.cpp
+++ b/csrc/root_domain_map.cpp
@@ -217,8 +217,7 @@ std::unordered_map<IterDomain*, IterDomain*> PairwiseRootDomainMap::map(
       case MatmulRole::INPUT_A: {
         for (auto inx : c10::irange(producer_root.size() - 1)) {
           updatePairwiseRootDomainMap(
-            producer_root.at(inx),
-            consumer_root.at(inx));
+              producer_root.at(inx), consumer_root.at(inx));
         }
         break;
       }
@@ -230,8 +229,7 @@ std::unordered_map<IterDomain*, IterDomain*> PairwiseRootDomainMap::map(
       }
       case MatmulRole::INPUT_C: {
         updatePairwiseRootDomainMap(
-            producer_root.at(0),
-            consumer_root.at(out_size - 1));
+            producer_root.at(0), consumer_root.at(out_size - 1));
         break;
       }
       default:
diff --git a/csrc/scheduler/expr_eval_sched.cpp b/csrc/scheduler/expr_eval_sched.cpp
index a16acbf8f43..b25a290ce99 100644
--- a/csrc/scheduler/expr_eval_sched.cpp
+++ b/csrc/scheduler/expr_eval_sched.cpp
@@ -16,7 +16,8 @@ namespace nvfuser {
 // Check if the fusion has a single MatmulOp/LinearOp node
 bool ExprEvalScheduler::canScheduleCompileTime(Fusion* fusion) {
   auto exprs = fusion->exprs();
-  if (exprs.size() == 1 && (exprs.front()->isA<MatmulOp>() || exprs.front()->isA<LinearOp>())) {
+  if (exprs.size() == 1 &&
+      (exprs.front()->isA<MatmulOp>() || exprs.front()->isA<LinearOp>())) {
     return true;
   }
   scheduler_debug_utils::canScheduleRejectReason(
diff --git a/tests/cpp/test_matmul_aten_evaluation.cpp b/tests/cpp/test_matmul_aten_evaluation.cpp
index 09a3b48f1ac..f95e9fc4dbd 100644
--- a/tests/cpp/test_matmul_aten_evaluation.cpp
+++ b/tests/cpp/test_matmul_aten_evaluation.cpp
@@ -574,14 +574,14 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeConcrete) {
   auto tv0 = makeConcreteTensor(a_shape, DataType::Half);
   auto tv1 = makeConcreteTensor(b_shape, DataType::Half);
   TensorView* bias = nullptr;
-  if (bias_shape.has_value()){
+  if (bias_shape.has_value()) {
     bias = makeConcreteTensor(*bias_shape, DataType::Half);
   }
   auto tv2 = linear(tv0, tv1, bias);
 
   fusion->addInput(tv0);
   fusion->addInput(tv1);
-  if (bias_shape.has_value()){
+  if (bias_shape.has_value()) {
     fusion->addInput(bias);
   }
   fusion->addOutput(tv2);
@@ -595,10 +595,10 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeConcrete) {
   at::Tensor out_ref = at::linear(t0, t1, bias_opt);
 
   FusionExecutorCache fec(std::move(fusion));
-  
-  std::vector<at::Tensor> out = {};
+
+  d::vector<at::Tensor> out = {};
   if (bias_shape.has_value()){
-    out = fec.runFusionWithInputs({t0, t1, bias_opt});
+     out = fec.runFusionWithInputs({t0, t1, bias_opt});
   } else {
     out = fec.runFusionWithInputs({t0, t1});
   }
@@ -608,21 +608,25 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeConcrete) {
   EXPECT_EQ(executors.size(), 1);
   // Verify that fusion compilation was skipped.
   EXPECT_FALSE(executors.front().hasCompiledKernel());
-  
-  EXPECT_TRUE(at::allclose(out[0], out_ref));
+
+
+
+  CT_TRUE(at::allclose(out[0], out_ref));
 }
 TEST_P(LinearNodeParametrizedTest, LinearNodeSymbolic) {
   auto fusion = std::make_unique<Fusion>();
   FusionGuard fg(fusion.get());
 
   const auto& [a_shape, b_shape, bias_shape] = GetParam();
+
   
-  auto tv0 = makeSymbolicTensor(a_shape.size(), DataType::Half);
+
+  v0 = makeSymbolicTensor(a_shape.size(), DataType::Half);
   auto tv1 = makeSymbolicTensor(b_shape.size(), DataType::Half);
 
   TensorView* bias = nullptr;
   if (bias_shape.has_value()){
-    bias = makeSymbolicTensor(*bias_shape, DataType::Half);
+     bias = makeSymbolicTensor(*bias_shape, DataType::Half);
   }
 
   auto tv2 = linear(tv0, tv1, bias);
@@ -630,7 +634,7 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeSymbolic) {
   fusion->addInput(tv0);
   fusion->addInput(tv1);
   if (bias_shape.has_value()){
-    fusion->addInput(bias);
+     fusion->addInput(bias);
   }
   fusion->addOutput(tv2);
 
@@ -643,10 +647,12 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeSymbolic) {
   at::Tensor out_ref = at::linear(t0, t1, bias_opt);
 
   FusionExecutorCache fec(std::move(fusion));
-  
-  std::vector<at::Tensor> out = {};
+
+  st
+
+  tor<at::Tensor> out = {};
   if (bias_shape.has_value()){
-    out = fec.runFusionWithInputs({t0, t1, bias_opt});
+    ou t = fec.runFusionWithInputs({t0, t1, bias_opt});
   } else {
     out = fec.runFusionWithInputs({t0, t1});
   }
@@ -656,8 +662,10 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeSymbolic) {
   EXPECT_EQ(executors.size(), 1);
   // Verify that fusion compilation was skipped.
   EXPECT_FALSE(executors.front().hasCompiledKernel());
-  
-  EXPECT_TRUE(at::allclose(out[0], out_ref));
+
+  EXPE
+
+  E(at::allclose(out[0], out_ref));
 }
 
 constexpr int64_t b = 128, m = 64, k = 32, n = 16;
@@ -700,18 +708,25 @@ INSTANTIATE_TEST_SUITE_P(
     LinearWithoutBias,
     LinearNodeParametrizedTest,
     testing::Combine(
-      testing::Values(Sizes({k}), Sizes({m, k}), Sizes({b, m, k})),
-      testing::Values(Sizes({k}), Sizes({n, k})),
-      testing::Values(std::nullopt)
+      tes
+        alues(Sizes({k}), Sizes({m, k}), Sizes({b, m, k})),
+      tes
+        alues(Sizes({k}), Sizes({n, k})),
+      tes
+        alues(std::nullopt)
     ));
 
-INSTANTIATE_TEST_SUITE_P(
+NTIATE_TEST_SUITE_P(
     LinearWithBias,
     LinearNodeParametrizedTest,
     testing::Combine(
-      testing::Values(Sizes({k}), Sizes({m, k}), Sizes({b, m, k})), 
-      testing::Values(Sizes({n, k})), 
-      testing::Values(Sizes({n}))
+      tes
+        alues(Sizes({k}), Sizes({m, k}), Sizes({b, m, k})),
+      tes
+        lues(Sizes({n, k})),
+      test
+        ues(Sizes({n}))
     ));
 
-} // namespace nvfuser
+} mespace nvfuser
+            
\ No newline at end of file

From 234f0debba7faf17cf98a7a70f757b3646dc1027 Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Tue, 14 May 2024 05:06:06 +0000
Subject: [PATCH 07/30] fix formatting

---
 tests/cpp/test_matmul_aten_evaluation.cpp | 55 ++++++++---------------
 1 file changed, 19 insertions(+), 36 deletions(-)

diff --git a/tests/cpp/test_matmul_aten_evaluation.cpp b/tests/cpp/test_matmul_aten_evaluation.cpp
index f95e9fc4dbd..c7254461d8d 100644
--- a/tests/cpp/test_matmul_aten_evaluation.cpp
+++ b/tests/cpp/test_matmul_aten_evaluation.cpp
@@ -597,8 +597,8 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeConcrete) {
   FusionExecutorCache fec(std::move(fusion));
 
   d::vector<at::Tensor> out = {};
-  if (bias_shape.has_value()){
-     out = fec.runFusionWithInputs({t0, t1, bias_opt});
+  if (bias_shape.has_value()) {
+    out = fec.runFusionWithInputs({t0, t1, bias_opt});
   } else {
     out = fec.runFusionWithInputs({t0, t1});
   }
@@ -609,9 +609,7 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeConcrete) {
   // Verify that fusion compilation was skipped.
   EXPECT_FALSE(executors.front().hasCompiledKernel());
 
-
-
-  CT_TRUE(at::allclose(out[0], out_ref));
+  EXPECT_TRUE(at::allclose(out[0], out_ref));
 }
 TEST_P(LinearNodeParametrizedTest, LinearNodeSymbolic) {
   auto fusion = std::make_unique<Fusion>();
@@ -619,22 +617,20 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeSymbolic) {
 
   const auto& [a_shape, b_shape, bias_shape] = GetParam();
 
-  
-
-  v0 = makeSymbolicTensor(a_shape.size(), DataType::Half);
+  auto tv0 = makeSymbolicTensor(a_shape.size(), DataType::Half);
   auto tv1 = makeSymbolicTensor(b_shape.size(), DataType::Half);
 
   TensorView* bias = nullptr;
-  if (bias_shape.has_value()){
-     bias = makeSymbolicTensor(*bias_shape, DataType::Half);
+  if (bias_shape.has_value()) {
+    bias = makeSymbolicTensor(*bias_shape, DataType::Half);
   }
 
   auto tv2 = linear(tv0, tv1, bias);
 
   fusion->addInput(tv0);
   fusion->addInput(tv1);
-  if (bias_shape.has_value()){
-     fusion->addInput(bias);
+  if (bias_shape.has_value()) {
+    fusion->addInput(bias);
   }
   fusion->addOutput(tv2);
 
@@ -648,10 +644,8 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeSymbolic) {
 
   FusionExecutorCache fec(std::move(fusion));
 
-  st
-
-  tor<at::Tensor> out = {};
-  if (bias_shape.has_value()){
+  std::vector<at::Tensor> out = {};
+  if (bias_shape.has_value()) {
     ou t = fec.runFusionWithInputs({t0, t1, bias_opt});
   } else {
     out = fec.runFusionWithInputs({t0, t1});
@@ -663,9 +657,7 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeSymbolic) {
   // Verify that fusion compilation was skipped.
   EXPECT_FALSE(executors.front().hasCompiledKernel());
 
-  EXPE
-
-  E(at::allclose(out[0], out_ref));
+  EXPECT(at::allclose(out[0], out_ref));
 }
 
 constexpr int64_t b = 128, m = 64, k = 32, n = 16;
@@ -708,25 +700,16 @@ INSTANTIATE_TEST_SUITE_P(
     LinearWithoutBias,
     LinearNodeParametrizedTest,
     testing::Combine(
-      tes
-        alues(Sizes({k}), Sizes({m, k}), Sizes({b, m, k})),
-      tes
-        alues(Sizes({k}), Sizes({n, k})),
-      tes
-        alues(std::nullopt)
-    ));
+        testing::Values(Sizes({k}), Sizes({m, k}), Sizes({b, m, k})),
+        testing::Values(Sizes({k}), Sizes({n, k})),
+        testing::Values(std::nullopt)));
 
 NTIATE_TEST_SUITE_P(
     LinearWithBias,
     LinearNodeParametrizedTest,
     testing::Combine(
-      tes
-        alues(Sizes({k}), Sizes({m, k}), Sizes({b, m, k})),
-      tes
-        lues(Sizes({n, k})),
-      test
-        ues(Sizes({n}))
-    ));
-
-} mespace nvfuser
-            
\ No newline at end of file
+        testing::Values(Sizes({k}), Sizes({m, k}), Sizes({b, m, k})),
+        testing::Values(Sizes({n, k})),
+        testing::Values(Sizes({n}))));
+
+} // namespace nvfuser

From 855091c0d76e6cea732285d7f073b08eec7199e5 Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Tue, 14 May 2024 05:21:40 +0000
Subject: [PATCH 08/30] comments

---
 csrc/ops/composite.cpp   | 25 +++++++++++++++----------
 csrc/root_domain_map.cpp | 11 +++++++++--
 2 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/csrc/ops/composite.cpp b/csrc/ops/composite.cpp
index f36d57ba3c1..03b08a493c9 100644
--- a/csrc/ops/composite.cpp
+++ b/csrc/ops/composite.cpp
@@ -78,28 +78,33 @@ static TensorView* newForLinear(
 
   if (weight_domain.size() == 2) {
     // Add out_features to output domain.
+    std::vector<IterDomain*> out_features_ids = {weight_domain.at(0)};
     if (bias != nullptr) {
       auto bias_domain =
           TensorDomain::noReductions(bias->getMaybeRFactorDomain());
-      out_domain[ndims_out - 1] =
-          ops::newOutputIterDomain({weight_domain.at(0), bias_domain.at(0)});
-    } else {
-      out_domain[ndims_out - 1] =
-          ops::newOutputIterDomain({weight_domain.at(0)});
+      out_features_ids.emplace_back(bias_domain.at(0));
     }
+    out_domain[ndims_out - 1] = ops::newOutputIterDomain(out_features_ids);
   }
+}
 
-  TensorDomain* td = IrBuilder::create<TensorDomain>(
-      out_domain, TensorDomain::getContiguityFilledWith(out_domain, true));
+TensorDomain* td = IrBuilder::create<TensorDomain>(
+    out_domain,
+    TensorDomain::getContiguityFilledWith(out_domain, true));
 
-  return IrBuilder::create<TensorView>(td, input->dtype());
+return IrBuilder::create<TensorView>(td, input->dtype());
 }
 
 } // namespace
 
 TensorView* linear(TensorView* tv_a, TensorView* tv_b, TensorView* bias) {
-  NVF_CHECK(tv_a->nDims() >= 1);
-  NVF_CHECK(tv_b->nDims() == 1 || tv_b->nDims() == 2);
+  NVF_CHECK(tv_a->nDims() >= 1, "Input A must be atleast 1D.");
+  NVF_CHECK(
+      tv_b->nDims() == 1 || tv_b->nDims() == 2,
+      "Input B must be a 1D / 2D tensor.");
+  NVF_CHECK(
+      tv_b->nDims() == 1 && bias != nullptr,
+      "Input B must be a 2D tensor if bias is present, got 1D.")
 
   // For all other cases, create a new LinearOp
   TensorView* out = newForLinear(tv_a, tv_b, bias);
diff --git a/csrc/root_domain_map.cpp b/csrc/root_domain_map.cpp
index b6ca1730f74..964580efb17 100644
--- a/csrc/root_domain_map.cpp
+++ b/csrc/root_domain_map.cpp
@@ -198,8 +198,6 @@ std::unordered_map<IterDomain*, IterDomain*> PairwiseRootDomainMap::map(
     return dom_map;
   }
 
-  // For LinearOp, all but the last dimension are the same shape as the input.
-  // The last dimension is out_features (if present).
   if (LinearOp* op = dynamic_cast<LinearOp*>(consumer_tv_->definition())) {
     auto out_size = consumer_root.size();
 
@@ -213,8 +211,15 @@ std::unordered_map<IterDomain*, IterDomain*> PairwiseRootDomainMap::map(
       input_role = MatmulRole::INPUT_C;
     }
 
+    // LinearOp:
+    // inputs (INPUT_A) = {*, in_features}
+    // weight (INPUT_B) = {out_features, in_features} / {in_features}
+    // bias (INPUT_C) = {out_features} / {}
+    // output = {*, out_features} / {*}
+
     switch (input_role) {
       case MatmulRole::INPUT_A: {
+        // Linear output is same as input for all but the last dimension
         for (auto inx : c10::irange(producer_root.size() - 1)) {
           updatePairwiseRootDomainMap(
               producer_root.at(inx), consumer_root.at(inx));
@@ -228,6 +233,7 @@ std::unordered_map<IterDomain*, IterDomain*> PairwiseRootDomainMap::map(
         }
       }
       case MatmulRole::INPUT_C: {
+        // The last dimension of LinearOp is out_features.
         updatePairwiseRootDomainMap(
             producer_root.at(0), consumer_root.at(out_size - 1));
         break;
@@ -1441,3 +1447,4 @@ const DisjointSets<const IterDomain*>& ExactRootDomainMap::getMappedSets()
 }
 
 } // namespace nvfuser
+      

From e4049356977b53aa77715323ed0dea6dd173c1b3 Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Tue, 14 May 2024 05:35:07 +0000
Subject: [PATCH 09/30] remove null chars

---
 csrc/root_domain_map.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/csrc/root_domain_map.cpp b/csrc/root_domain_map.cpp
index 964580efb17..0c9ff785109 100644
--- a/csrc/root_domain_map.cpp
+++ b/csrc/root_domain_map.cpp
@@ -1447,4 +1447,3 @@ const DisjointSets<const IterDomain*>& ExactRootDomainMap::getMappedSets()
 }
 
 } // namespace nvfuser
-      

From 669a3f006d48cce878512e9495610ddeae86253a Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Tue, 14 May 2024 05:37:59 +0000
Subject: [PATCH 10/30] bump version

---
 version.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version.txt b/version.txt
index abd410582de..3a4036fb450 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.2.4
+0.2.5

From 3278f3b677920911b730b02a59e4d81d17370582 Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Tue, 14 May 2024 21:15:50 +0000
Subject: [PATCH 11/30] fix lintrunner formatting, refactor linear op mapping,
 add bcast dims to test

---
 csrc/ops/composite.cpp                    | 50 +++++++++-----------
 csrc/ops/utils.cpp                        | 56 +++++++++++++++++++++++
 csrc/ops/utils.h                          |  8 ++++
 csrc/root_domain_map.cpp                  | 32 ++++---------
 tests/cpp/test_matmul_aten_evaluation.cpp | 22 ++++-----
 5 files changed, 106 insertions(+), 62 deletions(-)

diff --git a/csrc/ops/composite.cpp b/csrc/ops/composite.cpp
index 03b08a493c9..b92e6c25f9b 100644
--- a/csrc/ops/composite.cpp
+++ b/csrc/ops/composite.cpp
@@ -57,42 +57,37 @@ TensorView* dropout_backward(TensorView* dy, TensorView* mask, Val* scale) {
 namespace {
 
 static TensorView* newForLinear(
-    TensorView* input,
-    TensorView* weight,
+    TensorView* tv_a,
+    TensorView* tv_b,
     TensorView* bias) {
-  auto input_domain =
-      TensorDomain::noReductions(input->getMaybeRFactorDomain());
-  auto weight_domain =
-      TensorDomain::noReductions(weight->getMaybeRFactorDomain());
+  auto orig_domain_a =
+      TensorDomain::noReductions(tv_a->getMaybeRFactorDomain());
+  auto orig_domain_b =
+      TensorDomain::noReductions(tv_b->getMaybeRFactorDomain());
 
-  // Linear: inputs = {*, in_features}, weight = {out_features, in_features} /
+  // Linear: a = {*, in_features}, b = {out_features, in_features} /
   // {in_features} For the linear output, all but the last dimension are the
-  // same shape as the input. The last dimension is out_features (if present).
-  auto ndims_out = (input_domain.size() - 1) + (weight_domain.size() - 1);
+  // same shape as the first input. The last dimension is out_features (if present).
+  auto ndims_out = (orig_domain_a.size() - 1) + (orig_domain_b.size() - 1);
 
   std::vector<IterDomain*> out_domain(ndims_out, nullptr);
 
-  for (auto idx : c10::irange(input_domain.size() - 1)) {
-    out_domain[idx] = ops::newOutputIterDomain({input_domain.at(idx)});
+  const std::vector<IterDomain*>& mapping_a = ops::mapLinearOpIterDomains(
+      orig_domain_a, MatmulRole::INPUT_A, ndims_out);
+  const std::vector<IterDomain*>& mapping_b = ops::mapLinearOpIterDomains(
+      orig_domain_b, MatmulRole::INPUT_B, ndims_out);
+  std::vector<IterDomain*> mapping_bias (ndims_out, nullptr);
+  if (bias != nullptr){
+    auto bias_domain = TensorDomain::noReductions(bias->getMaybeRFactorDomain());
+    mapping_bias = ops::mapLinearOpIterDomains(bias_domain, MatmulRole::INPUT_C, ndims_out);
   }
 
-  if (weight_domain.size() == 2) {
-    // Add out_features to output domain.
-    std::vector<IterDomain*> out_features_ids = {weight_domain.at(0)};
-    if (bias != nullptr) {
-      auto bias_domain =
-          TensorDomain::noReductions(bias->getMaybeRFactorDomain());
-      out_features_ids.emplace_back(bias_domain.at(0));
-    }
-    out_domain[ndims_out - 1] = ops::newOutputIterDomain(out_features_ids);
-  }
-}
+  out_domain = ops::newOutputDomain({mapping_a, mapping_b, mapping_bias});
 
-TensorDomain* td = IrBuilder::create<TensorDomain>(
-    out_domain,
-    TensorDomain::getContiguityFilledWith(out_domain, true));
+  TensorDomain* td = IrBuilder::create<TensorDomain>(
+      out_domain, TensorDomain::getContiguityFilledWith(out_domain, true));
 
-return IrBuilder::create<TensorView>(td, input->dtype());
+  return IrBuilder::create<TensorView>(td, tv_a->dtype());
 }
 
 } // namespace
@@ -102,9 +97,6 @@ TensorView* linear(TensorView* tv_a, TensorView* tv_b, TensorView* bias) {
   NVF_CHECK(
       tv_b->nDims() == 1 || tv_b->nDims() == 2,
       "Input B must be a 1D / 2D tensor.");
-  NVF_CHECK(
-      tv_b->nDims() == 1 && bias != nullptr,
-      "Input B must be a 2D tensor if bias is present, got 1D.")
 
   // For all other cases, create a new LinearOp
   TensorView* out = newForLinear(tv_a, tv_b, bias);
diff --git a/csrc/ops/utils.cpp b/csrc/ops/utils.cpp
index 16928f25002..b49d6f788cb 100644
--- a/csrc/ops/utils.cpp
+++ b/csrc/ops/utils.cpp
@@ -221,6 +221,43 @@ std::vector<IterDomain*> mapMatmulOpIterDomains(
   return mapping;
 }
 
+std::vector<IterDomain*> mapLinearOpIterDomains(
+    const std::vector<IterDomain*>& input_domain,
+    MatmulRole input_role,
+    size_t out_size) {
+
+  std::vector<IterDomain*> mapping(out_size, nullptr);
+  auto inp_size = input_domain.size();
+
+  // Input A: {*, M, K}
+  // Input B: {*, N, K} / {K}
+  // Bias: {N} / {}
+
+  switch (input_role) {
+      case MatmulRole::INPUT_A: {
+        // Linear output is same as input for all but the last dimension
+        for (auto inx : c10::irange(inp_size - 1)) {
+          mapping[inx] = input_domain[inx];
+        }
+        break;
+      }
+      case MatmulRole::INPUT_B: {
+        if (inp_size == 1) {
+          // out_features is not present, no mapping required.
+          break;
+        }
+      }
+      case MatmulRole::INPUT_C: {
+        // The last dimension of LinearOp is out_features.
+        mapping[out_size - 1] = input_domain[0];
+        break;
+      }
+      default:
+        NVF_ERROR("Unexpected input type.");
+  }
+  return mapping;
+}
+
 // Adding these pragmas since gcc-12.2.1
 // incorrectly reports a warning with the use of evaluate
 #if defined(__GNUC__) && !defined(__clang__)
@@ -311,6 +348,25 @@ IterDomain* newOutputIterDomain(
 #pragma GCC diagnostic pop
 #endif
 
+std::vector<IterDomain*> newOutputDomain(const std::vector<std::vector<IterDomain*>>& input_ids) {
+  NVF_CHECK(
+      !input_ids.empty(),
+      "Tried to create new output Tensorview but received empty list.");
+
+  std::vector<IterDomain*> out_domain(input_ids.front().size(), nullptr);
+
+  for (const auto dim_i : c10::irange(out_domain.size())) {
+    std::vector<IterDomain*> ids_i;
+    ids_i.reserve(input_ids.size());
+    for (auto ids : input_ids) {
+      if (ids[dim_i] != nullptr)
+        ids_i.emplace_back(ids[dim_i]);
+    }
+    out_domain[dim_i] = newOutputIterDomain(ids_i);
+  }
+  return out_domain;
+}
+
 std::vector<IterDomain*> newOutputDomain(const std::vector<Val*>& vals) {
   std::vector<TensorView*> tvs;
   for (auto val : vals) {
diff --git a/csrc/ops/utils.h b/csrc/ops/utils.h
index 47ac94b7d04..6e4340b2b54 100644
--- a/csrc/ops/utils.h
+++ b/csrc/ops/utils.h
@@ -51,6 +51,11 @@ std::vector<IterDomain*> mapMatmulOpIterDomains(
     MatmulRole input_role,
     size_t out_size);
 
+std::vector<IterDomain*> mapLinearOpIterDomains(
+    const std::vector<IterDomain*>& input_domain,
+    MatmulRole input_role,
+    size_t out_size);
+    
 // Takes a vector of aligned input iterdomains to create the output iterdomain.
 // This is used if the input iterdomains are not trivially mapped to the output
 // iterdomains. For eg: MatmulOp. If given, the forced_iter_type argument will
@@ -60,6 +65,9 @@ IterDomain* newOutputIterDomain(
     const std::vector<IterDomain*>& ids,
     const std::optional<IterType> force_iter_type = std::nullopt);
 
+// Takes multiple vectors of input iterdomains and assumes they are aligned to create the output tensorview.
+std::vector<IterDomain*> newOutputDomain(const std::vector<std::vector<IterDomain*>>& input_ids);
+
 // Takes a vector of tensorviews and assumes they are all aligned to create the
 // output tensorview. For eg: BinaryOp.
 std::vector<IterDomain*> newOutputDomain(const std::vector<Val*>& vals);
diff --git a/csrc/root_domain_map.cpp b/csrc/root_domain_map.cpp
index 0c9ff785109..980811ff05f 100644
--- a/csrc/root_domain_map.cpp
+++ b/csrc/root_domain_map.cpp
@@ -217,30 +217,18 @@ std::unordered_map<IterDomain*, IterDomain*> PairwiseRootDomainMap::map(
     // bias (INPUT_C) = {out_features} / {}
     // output = {*, out_features} / {*}
 
-    switch (input_role) {
-      case MatmulRole::INPUT_A: {
-        // Linear output is same as input for all but the last dimension
-        for (auto inx : c10::irange(producer_root.size() - 1)) {
-          updatePairwiseRootDomainMap(
-              producer_root.at(inx), consumer_root.at(inx));
-        }
-        break;
-      }
-      case MatmulRole::INPUT_B: {
-        if (producer_root.size() == 1) {
-          // out_features is not present, no mapping required.
-          break;
-        }
-      }
-      case MatmulRole::INPUT_C: {
-        // The last dimension of LinearOp is out_features.
-        updatePairwiseRootDomainMap(
-            producer_root.at(0), consumer_root.at(out_size - 1));
-        break;
+    const std::vector<IterDomain*>& aligned_producer_ids =
+        ops::mapLinearOpIterDomains(producer_root, input_role, out_size);
+
+    for (auto inx : c10::irange(out_size)) {
+      IterDomain* producer_id = aligned_producer_ids.at(inx);
+      IterDomain* consumer_id = consumer_root.at(inx);
+      if (producer_id == nullptr) {
+        continue;
       }
-      default:
-        NVF_ERROR("Unexpected input type.");
+      updatePairwiseRootDomainMap(producer_id, consumer_id);
     }
+
     return dom_map;
   }
 
diff --git a/tests/cpp/test_matmul_aten_evaluation.cpp b/tests/cpp/test_matmul_aten_evaluation.cpp
index c7254461d8d..e453f0f2767 100644
--- a/tests/cpp/test_matmul_aten_evaluation.cpp
+++ b/tests/cpp/test_matmul_aten_evaluation.cpp
@@ -35,12 +35,12 @@ class MatmulATenEvaluationTest : public NVFuserTest {
 using Sizes = std::vector<int64_t>;
 using MatmulNodeParamType = std::tuple<Sizes, Sizes>;
 
-class ATenNodesParametrizedTest
+class MatmulNodeParametrizedTest
     : public NVFuserFixtureParamTest<MatmulNodeParamType> {
  protected:
   // Allocation order set by the pass breaks matmul tests
   // see issue https://github.com/NVIDIA/Fuser/issues/1810
-  ATenNodesParametrizedTest() : optimization_guard_(false) {}
+  MatmulNodeParametrizedTest() : optimization_guard_(false) {}
 
  private:
   preseg_passes::OptimizationPassGuard<preseg_passes::AllocationDomainPass>
@@ -539,7 +539,7 @@ TEST_P(ATenNodesParametrizedTest, MatmulNodeConcrete) {
   EXPECT_TRUE(at::allclose(out[0], out_ref));
 }
 
-TEST_P(ATenNodesParametrizedTest, MatmulNodeSymbolic) {
+TEST_P(MatmulNodeParametrizedTest, MatmulNodeSymbolic) {
   auto fusion = std::make_unique<Fusion>();
   FusionGuard fg(fusion.get());
 
@@ -596,7 +596,7 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeConcrete) {
 
   FusionExecutorCache fec(std::move(fusion));
 
-  d::vector<at::Tensor> out = {};
+  std::vector<at::Tensor> out = {};
   if (bias_shape.has_value()) {
     out = fec.runFusionWithInputs({t0, t1, bias_opt});
   } else {
@@ -646,7 +646,7 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeSymbolic) {
 
   std::vector<at::Tensor> out = {};
   if (bias_shape.has_value()) {
-    ou t = fec.runFusionWithInputs({t0, t1, bias_opt});
+    out = fec.runFusionWithInputs({t0, t1, bias_opt});
   } else {
     out = fec.runFusionWithInputs({t0, t1});
   }
@@ -657,7 +657,7 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeSymbolic) {
   // Verify that fusion compilation was skipped.
   EXPECT_FALSE(executors.front().hasCompiledKernel());
 
-  EXPECT(at::allclose(out[0], out_ref));
+  EXPECT_TRUE(at::allclose(out[0], out_ref));
 }
 
 constexpr int64_t b = 128, m = 64, k = 32, n = 16;
@@ -665,7 +665,7 @@ constexpr int64_t b = 128, m = 64, k = 32, n = 16;
 // Parametrize a_shape and b_shape
 INSTANTIATE_TEST_SUITE_P(
     ,
-    ATenNodesParametrizedTest,
+    MatmulNodeParametrizedTest,
     testing::Combine(
         testing::Values(
             Sizes({k}),
@@ -700,15 +700,15 @@ INSTANTIATE_TEST_SUITE_P(
     LinearWithoutBias,
     LinearNodeParametrizedTest,
     testing::Combine(
-        testing::Values(Sizes({k}), Sizes({m, k}), Sizes({b, m, k})),
-        testing::Values(Sizes({k}), Sizes({n, k})),
+        testing::Values(Sizes({k}), Sizes({m, k}), Sizes({b, m, k}), Sizes({1, k}), Sizes({b, 1, k})),
+        testing::Values(Sizes({k}), Sizes({n, k}), Sizes({1, k})),
         testing::Values(std::nullopt)));
 
-NTIATE_TEST_SUITE_P(
+INSTANTIATE_TEST_SUITE_P(
     LinearWithBias,
     LinearNodeParametrizedTest,
     testing::Combine(
-        testing::Values(Sizes({k}), Sizes({m, k}), Sizes({b, m, k})),
+        testing::Values(Sizes({k}), Sizes({m, k}), Sizes({b, m, k}), Sizes({1, k}), Sizes({b, 1, k})),
         testing::Values(Sizes({n, k})),
         testing::Values(Sizes({n}))));
 

From c1b7a924a96e4c6adb6cefd69785e808b82bd356 Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Tue, 14 May 2024 21:20:21 +0000
Subject: [PATCH 12/30] util function

---
 csrc/ops/composite.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/csrc/ops/composite.cpp b/csrc/ops/composite.cpp
index b92e6c25f9b..9ee77fe3f3b 100644
--- a/csrc/ops/composite.cpp
+++ b/csrc/ops/composite.cpp
@@ -70,8 +70,6 @@ static TensorView* newForLinear(
   // same shape as the first input. The last dimension is out_features (if present).
   auto ndims_out = (orig_domain_a.size() - 1) + (orig_domain_b.size() - 1);
 
-  std::vector<IterDomain*> out_domain(ndims_out, nullptr);
-
   const std::vector<IterDomain*>& mapping_a = ops::mapLinearOpIterDomains(
       orig_domain_a, MatmulRole::INPUT_A, ndims_out);
   const std::vector<IterDomain*>& mapping_b = ops::mapLinearOpIterDomains(
@@ -82,7 +80,7 @@ static TensorView* newForLinear(
     mapping_bias = ops::mapLinearOpIterDomains(bias_domain, MatmulRole::INPUT_C, ndims_out);
   }
 
-  out_domain = ops::newOutputDomain({mapping_a, mapping_b, mapping_bias});
+  std::vector<IterDomain*> out_domain = ops::newOutputDomain({mapping_a, mapping_b, mapping_bias});
 
   TensorDomain* td = IrBuilder::create<TensorDomain>(
       out_domain, TensorDomain::getContiguityFilledWith(out_domain, true));
@@ -301,8 +299,6 @@ static TensorView* newForMatmul(TensorView* tv_a, TensorView* tv_b) {
     ndims_out = std::max(ndims_a, ndims_b);
   }
 
-  std::vector<IterDomain*> out_domain(ndims_out, nullptr);
-
   const std::vector<IterDomain*>& mapping_a = ops::mapMatmulOpIterDomains(
       orig_domain_a, MatmulRole::INPUT_A, ndims_out);
   const std::vector<IterDomain*>& mapping_b = ops::mapMatmulOpIterDomains(

From c1111482ef427a1171e474eb8fdb74349c7b0646 Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Tue, 14 May 2024 21:45:02 +0000
Subject: [PATCH 13/30] bcast test, bias checks

---
 csrc/ops/composite.cpp                  | 22 ++++++++++++++++++----
 csrc/ops/utils.cpp                      |  3 ++-
 tests/python/pytest_input_generators.py |  2 +-
 3 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/csrc/ops/composite.cpp b/csrc/ops/composite.cpp
index 9ee77fe3f3b..0f7021e645c 100644
--- a/csrc/ops/composite.cpp
+++ b/csrc/ops/composite.cpp
@@ -91,11 +91,25 @@ static TensorView* newForLinear(
 } // namespace
 
 TensorView* linear(TensorView* tv_a, TensorView* tv_b, TensorView* bias) {
-  NVF_CHECK(tv_a->nDims() >= 1, "Input A must be atleast 1D.");
-  NVF_CHECK(
-      tv_b->nDims() == 1 || tv_b->nDims() == 2,
-      "Input B must be a 1D / 2D tensor.");
+  auto ndims_a = TensorDomain::noReductions(tv_a->getMaybeRFactorDomain()).size();
+  NVF_CHECK(ndims_a > 0, "Input A must be atleast 1D.");
+  
+  auto ndims_b = TensorDomain::noReductions(tv_b->getMaybeRFactorDomain()).size();
+  NVF_CHECK(ndims_b == 1 || ndims_b == 2, "Input B must be a 1D / 2D tensor.");
+
+  NVF_CHECK(ndims_b == 2 || bias == nullptr, "Expected B to be a 2D matrix if bias is given, got 1D.")
 
+  NVF_CHECK(
+      tv_a->dtype() == tv_b->dtype(),
+      "Expected A and B dtypes to have the same dtype, got: ",
+      tv_a->dtype(),
+      " and ",
+      tv_b->dtype());
+  
+  NVF_CHECK(
+    bias == nullptr || bias->dtype() == tv_a->dtype(),
+    "Expected bias to have the same dtype as A and B, got: ", bias->dtype(), " and ", tv_b->dtype()
+  );
   // For all other cases, create a new LinearOp
   TensorView* out = newForLinear(tv_a, tv_b, bias);
   IrBuilder::create<LinearOp>(out, tv_a, tv_b, bias);
diff --git a/csrc/ops/utils.cpp b/csrc/ops/utils.cpp
index b49d6f788cb..a88ffb130b3 100644
--- a/csrc/ops/utils.cpp
+++ b/csrc/ops/utils.cpp
@@ -359,8 +359,9 @@ std::vector<IterDomain*> newOutputDomain(const std::vector<std::vector<IterDomai
     std::vector<IterDomain*> ids_i;
     ids_i.reserve(input_ids.size());
     for (auto ids : input_ids) {
-      if (ids[dim_i] != nullptr)
+      if (ids[dim_i] != nullptr){
         ids_i.emplace_back(ids[dim_i]);
+      }
     }
     out_domain[dim_i] = newOutputIterDomain(ids_i);
   }
diff --git a/tests/python/pytest_input_generators.py b/tests/python/pytest_input_generators.py
index aa29430c751..24a036ea768 100644
--- a/tests/python/pytest_input_generators.py
+++ b/tests/python/pytest_input_generators.py
@@ -1530,7 +1530,7 @@ def linear_input_generator(
     K = 32
 
     # Cases without bias
-    shapes_input = ((K), (M, K), (B, M, K))
+    shapes_input = ((K), (M, K), (B, M, K), (B, 1, M, K))
     shapes_weight = ((K), (N, K))
     for shape_input, shape_weight in itertools.product(shapes_input, shapes_weight):
         yield SampleInput(make_arg(shape_input), make_arg(shape_weight))

From 229fbf6627250e860637fc71515f2ca59a950ce9 Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Tue, 14 May 2024 23:13:49 +0000
Subject: [PATCH 14/30] 0-D bias, comments

---
 csrc/ir/internal_nodes.h                  |  3 ++-
 csrc/ops/composite.cpp                    |  1 +
 csrc/ops/utils.cpp                        | 14 ++++++++------
 tests/cpp/test_matmul_aten_evaluation.cpp |  6 +++---
 tests/python/test_python_frontend.py      |  3 +--
 5 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/csrc/ir/internal_nodes.h b/csrc/ir/internal_nodes.h
index 38892ef581b..3dc644e10af 100644
--- a/csrc/ir/internal_nodes.h
+++ b/csrc/ir/internal_nodes.h
@@ -2288,7 +2288,8 @@ class MatmulOp : public Expr {
       const std::vector<PolymorphicValue>& inputs) const override;
 };
 
-//! Linear Operator to be expression evaluated without decomposition.
+// Linear Operator to be expression evaluated without decomposition.
+// This node has the same functionality as F.linear (https://pytorch.org/docs/stable/generated/torch.nn.functional.linear.html#torch.nn.functional.linear)
 class LinearOp : public Expr {
  public:
   using Expr::Expr;
diff --git a/csrc/ops/composite.cpp b/csrc/ops/composite.cpp
index 0f7021e645c..81ff3073d77 100644
--- a/csrc/ops/composite.cpp
+++ b/csrc/ops/composite.cpp
@@ -97,6 +97,7 @@ TensorView* linear(TensorView* tv_a, TensorView* tv_b, TensorView* bias) {
   auto ndims_b = TensorDomain::noReductions(tv_b->getMaybeRFactorDomain()).size();
   NVF_CHECK(ndims_b == 1 || ndims_b == 2, "Input B must be a 1D / 2D tensor.");
 
+  // Note: This constraint is not documented but F.linear errors out if bias is given with 1D weights.
   NVF_CHECK(ndims_b == 2 || bias == nullptr, "Expected B to be a 2D matrix if bias is given, got 1D.")
 
   NVF_CHECK(
diff --git a/csrc/ops/utils.cpp b/csrc/ops/utils.cpp
index a88ffb130b3..2aa51a28c9d 100644
--- a/csrc/ops/utils.cpp
+++ b/csrc/ops/utils.cpp
@@ -232,7 +232,6 @@ std::vector<IterDomain*> mapLinearOpIterDomains(
   // Input A: {*, M, K}
   // Input B: {*, N, K} / {K}
   // Bias: {N} / {}
-
   switch (input_role) {
       case MatmulRole::INPUT_A: {
         // Linear output is same as input for all but the last dimension
@@ -242,14 +241,17 @@ std::vector<IterDomain*> mapLinearOpIterDomains(
         break;
       }
       case MatmulRole::INPUT_B: {
-        if (inp_size == 1) {
-          // out_features is not present, no mapping required.
-          break;
+        if (inp_size > 1) {
+          // Weight is of shape {out_features, in_features}
+          mapping[out_size - 1] = input_domain[0];
         }
+        break;
       }
       case MatmulRole::INPUT_C: {
-        // The last dimension of LinearOp is out_features.
-        mapping[out_size - 1] = input_domain[0];
+        if (inp_size > 0){
+          // Bias is 1D tensor of shape {out_features}
+          mapping[out_size - 1] = input_domain[0];
+        }
         break;
       }
       default:
diff --git a/tests/cpp/test_matmul_aten_evaluation.cpp b/tests/cpp/test_matmul_aten_evaluation.cpp
index e453f0f2767..7ef14e4ffbf 100644
--- a/tests/cpp/test_matmul_aten_evaluation.cpp
+++ b/tests/cpp/test_matmul_aten_evaluation.cpp
@@ -590,7 +590,7 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeConcrete) {
   at::Tensor t1 = at::randn(b_shape, at::kHalf).cuda();
   std::optional<at::Tensor> bias_opt = std::nullopt;
   if (bias_shape.has_value()) {
-    bias_opt = at::randn(*bias_shape, at::kHalf).cuda();
+    bias_opt = bias_shape.value().empty() ? at::scalar_tensor(3.14).to(at::kHalf).cuda(): at::randn(*bias_shape, at::kHalf).cuda();
   }
   at::Tensor out_ref = at::linear(t0, t1, bias_opt);
 
@@ -638,7 +638,7 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeSymbolic) {
   at::Tensor t1 = at::randn(b_shape, at::kHalf).cuda();
   std::optional<at::Tensor> bias_opt = std::nullopt;
   if (bias_shape.has_value()) {
-    bias_opt = at::randn(*bias_shape, at::kHalf).cuda();
+    bias_opt = bias_shape.value().empty() ? at::scalar_tensor(3.14).to(at::kHalf).cuda() : at::randn(*bias_shape, at::kHalf).cuda();
   }
   at::Tensor out_ref = at::linear(t0, t1, bias_opt);
 
@@ -710,6 +710,6 @@ INSTANTIATE_TEST_SUITE_P(
     testing::Combine(
         testing::Values(Sizes({k}), Sizes({m, k}), Sizes({b, m, k}), Sizes({1, k}), Sizes({b, 1, k})),
         testing::Values(Sizes({n, k})),
-        testing::Values(Sizes({n}))));
+        testing::Values(Sizes({}), Sizes({n}))));
 
 } // namespace nvfuser
diff --git a/tests/python/test_python_frontend.py b/tests/python/test_python_frontend.py
index 41664ccb0c1..00b78c89757 100644
--- a/tests/python/test_python_frontend.py
+++ b/tests/python/test_python_frontend.py
@@ -2408,7 +2408,6 @@ def test_linear(self):
         k = 8
         bias0d = torch.tensor(3.14, device="cuda", dtype=torch.float16)
         bias1d = torch.randn(n, device="cuda", dtype=torch.float16)
-        bias2d = torch.rand(m, n, device="cuda", dtype=torch.float16)
 
         inputs_mk_nk = [
             torch.randn(m, k, device="cuda", dtype=torch.float16),
@@ -2446,7 +2445,7 @@ def fusion_func(
             fd.add_output(t_out)
 
         in_tensors = [inputs_mk_nk, inputs_mk_kn, inputs_km_nk, inputs_km_kn]
-        use_bias = [None, bias0d, bias1d, bias2d]
+        use_bias = [None, bias0d, bias1d]
         for [inp, wt], use_bias in list(itertools.product(in_tensors, use_bias)):
             with self.subTest(inp=inp, wt=wt, use_bias=use_bias):
                 input_tensors = (

From 66ee73375c06126e93c4309edd4fcabd176dc578 Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Tue, 14 May 2024 23:24:10 +0000
Subject: [PATCH 15/30] comments

---
 csrc/ops/composite.h | 6 ++----
 csrc/ops/utils.h     | 5 ++++-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/csrc/ops/composite.h b/csrc/ops/composite.h
index fa617e75154..13ffe8f8884 100644
--- a/csrc/ops/composite.h
+++ b/csrc/ops/composite.h
@@ -47,10 +47,8 @@ NVF_API LstmResult lstm(
     TensorView* cell_x,
     TensorView* out_x);
 
-// Linear functions which takes in two tensors of shapes A[M,K] and
-// B[N,K]. Takes in a options bias of shape [N] and performs
-// out = A * B_Transpose + bias. The output dtype matches the dtype
-// ofthe inputs which should match.
+// Linear functions which takes in two tensors of shapes A[* , in_features], B[out_features, in_features] / [in_features] and an optional bias of shape [out_features] or 0D scalar.
+// Bias can only be given if B is a 2-D tensor. 
 TensorView* linear(TensorView* a, TensorView* b, TensorView* bias);
 // This is an implementation detail to reflect when linear is called
 // without a bias. This calls the above function. We use this function
diff --git a/csrc/ops/utils.h b/csrc/ops/utils.h
index 6e4340b2b54..f6750ef9f5e 100644
--- a/csrc/ops/utils.h
+++ b/csrc/ops/utils.h
@@ -51,11 +51,14 @@ std::vector<IterDomain*> mapMatmulOpIterDomains(
     MatmulRole input_role,
     size_t out_size);
 
+// For LinearOp, the output is the same as the first input (A[*, in_features])for all but the last dimension.
+// If the second input is 2D (B[out_features, in_features]), the last dimension of output is out_features.
+// If bias is 1D (bias[out_features]) it maps to the last dimension of the output.
 std::vector<IterDomain*> mapLinearOpIterDomains(
     const std::vector<IterDomain*>& input_domain,
     MatmulRole input_role,
     size_t out_size);
-    
+
 // Takes a vector of aligned input iterdomains to create the output iterdomain.
 // This is used if the input iterdomains are not trivially mapped to the output
 // iterdomains. For eg: MatmulOp. If given, the forced_iter_type argument will

From a70e0a08742651ee84384a15fdb25c78ae100e72 Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Tue, 14 May 2024 23:37:22 +0000
Subject: [PATCH 16/30] update tests

---
 tests/cpp/test_matmul_aten_evaluation.cpp | 8 ++++++++
 tests/python/pytest_input_generators.py   | 8 ++++----
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/tests/cpp/test_matmul_aten_evaluation.cpp b/tests/cpp/test_matmul_aten_evaluation.cpp
index 7ef14e4ffbf..29c8fd32d7e 100644
--- a/tests/cpp/test_matmul_aten_evaluation.cpp
+++ b/tests/cpp/test_matmul_aten_evaluation.cpp
@@ -712,4 +712,12 @@ INSTANTIATE_TEST_SUITE_P(
         testing::Values(Sizes({n, k})),
         testing::Values(Sizes({}), Sizes({n}))));
 
+INSTANTIATE_TEST_SUITE_P(
+    LinearReductionAxisIsOne,
+    LinearNodeParametrizedTest,
+    testing::Combine(
+        testing::Values(Sizes({m, 1}), Sizes({b, m, 1}))),
+        testing::Values(Sizes({n, 1})),
+        testing::Values(Sizes({}), Sizes({n})));
+
 } // namespace nvfuser
diff --git a/tests/python/pytest_input_generators.py b/tests/python/pytest_input_generators.py
index 24a036ea768..83f2cb35efe 100644
--- a/tests/python/pytest_input_generators.py
+++ b/tests/python/pytest_input_generators.py
@@ -1530,15 +1530,15 @@ def linear_input_generator(
     K = 32
 
     # Cases without bias
-    shapes_input = ((K), (M, K), (B, M, K), (B, 1, M, K))
-    shapes_weight = ((K), (N, K))
+    shapes_input = ((K), (M, K))
+    shapes_weight = ((K), (N, K), (1, K))
     for shape_input, shape_weight in itertools.product(shapes_input, shapes_weight):
         yield SampleInput(make_arg(shape_input), make_arg(shape_weight))
 
     # Cases with bias
     shape_weight = (N, K)
-    shape_bias = (N,)
-    for shape_input in shapes_input:
+    shapes_bias = (())
+    for shape_input, shape_bias in itertools.product(shapes_input, shapes_bias):
         yield SampleInput(
             make_arg(shape_input), make_arg(shape_weight), make_arg(shape_bias)
         )

From dc41b593ccc6723cb4cb09b1f72988a1d5578297 Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Tue, 14 May 2024 23:40:54 +0000
Subject: [PATCH 17/30] update tests

---
 tests/python/pytest_input_generators.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/python/pytest_input_generators.py b/tests/python/pytest_input_generators.py
index 83f2cb35efe..cf76e12dfa1 100644
--- a/tests/python/pytest_input_generators.py
+++ b/tests/python/pytest_input_generators.py
@@ -1530,14 +1530,14 @@ def linear_input_generator(
     K = 32
 
     # Cases without bias
-    shapes_input = ((K), (M, K))
+    shapes_input = ((K), (M, K), (B, M, K), (B, 1, M, K))
     shapes_weight = ((K), (N, K), (1, K))
     for shape_input, shape_weight in itertools.product(shapes_input, shapes_weight):
         yield SampleInput(make_arg(shape_input), make_arg(shape_weight))
 
     # Cases with bias
     shape_weight = (N, K)
-    shapes_bias = (())
+    shapes_bias = ((), (N,))
     for shape_input, shape_bias in itertools.product(shapes_input, shapes_bias):
         yield SampleInput(
             make_arg(shape_input), make_arg(shape_weight), make_arg(shape_bias)

From eeb1a9d72554eed00df2dacd5f5182e5ad462cc7 Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Wed, 15 May 2024 00:42:27 +0000
Subject: [PATCH 18/30] clangtidy

---
 csrc/root_domain_map.cpp                  | 8 +++++---
 tests/cpp/test_matmul_aten_evaluation.cpp | 4 ++--
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/csrc/root_domain_map.cpp b/csrc/root_domain_map.cpp
index 980811ff05f..0f68bf88270 100644
--- a/csrc/root_domain_map.cpp
+++ b/csrc/root_domain_map.cpp
@@ -202,13 +202,15 @@ std::unordered_map<IterDomain*, IterDomain*> PairwiseRootDomainMap::map(
     auto out_size = consumer_root.size();
 
     // Check if the producer is A, B or bias.
-    MatmulRole input_role;
+    std::optional<MatmulRole> input_role = std::nullopt;
     if (producer->sameAs(op->inA()->as<TensorView>()->domain())) {
       input_role = MatmulRole::INPUT_A;
     } else if (producer->sameAs(op->inB()->as<TensorView>()->domain())) {
       input_role = MatmulRole::INPUT_B;
-    } else {
+    } else if (producer->sameAs(op->bias()->as<TensorView>()->domain())){
       input_role = MatmulRole::INPUT_C;
+    } else {
+      NVF_ERROR(false, "Producer did not match any LinearOp input.")
     }
 
     // LinearOp:
@@ -218,7 +220,7 @@ std::unordered_map<IterDomain*, IterDomain*> PairwiseRootDomainMap::map(
     // output = {*, out_features} / {*}
 
     const std::vector<IterDomain*>& aligned_producer_ids =
-        ops::mapLinearOpIterDomains(producer_root, input_role, out_size);
+        ops::mapLinearOpIterDomains(producer_root, input_role.value(), out_size);
 
     for (auto inx : c10::irange(out_size)) {
       IterDomain* producer_id = aligned_producer_ids.at(inx);
diff --git a/tests/cpp/test_matmul_aten_evaluation.cpp b/tests/cpp/test_matmul_aten_evaluation.cpp
index 29c8fd32d7e..2105a50dd74 100644
--- a/tests/cpp/test_matmul_aten_evaluation.cpp
+++ b/tests/cpp/test_matmul_aten_evaluation.cpp
@@ -716,8 +716,8 @@ INSTANTIATE_TEST_SUITE_P(
     LinearReductionAxisIsOne,
     LinearNodeParametrizedTest,
     testing::Combine(
-        testing::Values(Sizes({m, 1}), Sizes({b, m, 1}))),
+        testing::Values(Sizes({m, 1}), Sizes({b, m, 1})),
         testing::Values(Sizes({n, 1})),
-        testing::Values(Sizes({}), Sizes({n})));
+        testing::Values(Sizes({}), Sizes({n}))));
 
 } // namespace nvfuser

From f338a4b036f4549e77e292bf0b6b6059be062741 Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Thu, 16 May 2024 04:18:36 +0000
Subject: [PATCH 19/30] update mapping to include K

---
 csrc/device_lower/utils.cpp | 1 +
 csrc/ops/composite.cpp      | 6 +++---
 csrc/ops/utils.cpp          | 9 +++++----
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/csrc/device_lower/utils.cpp b/csrc/device_lower/utils.cpp
index 99b22087589..f9f9e848fff 100644
--- a/csrc/device_lower/utils.cpp
+++ b/csrc/device_lower/utils.cpp
@@ -151,6 +151,7 @@ bool isTvOp(const Expr* expr) {
           LoadStoreOp,
           MatmulOp,
           MmaOp,
+          LinearOp,
           BroadcastOp,
           SqueezeOp,
           ExpandOp,
diff --git a/csrc/ops/composite.cpp b/csrc/ops/composite.cpp
index 81ff3073d77..8d2a7e84d5f 100644
--- a/csrc/ops/composite.cpp
+++ b/csrc/ops/composite.cpp
@@ -66,9 +66,9 @@ static TensorView* newForLinear(
       TensorDomain::noReductions(tv_b->getMaybeRFactorDomain());
 
   // Linear: a = {*, in_features}, b = {out_features, in_features} /
-  // {in_features} For the linear output, all but the last dimension are the
-  // same shape as the first input. The last dimension is out_features (if present).
-  auto ndims_out = (orig_domain_a.size() - 1) + (orig_domain_b.size() - 1);
+  // {in_features}.The linear output is {*, (out_features), rK}.
+  // The first out_size -2 dimensions are as the first input, followed by out_features (if present) and an additional reduction axis K.
+  auto ndims_out = (orig_domain_a.size() - 1) + orig_domain_b.size();
 
   const std::vector<IterDomain*>& mapping_a = ops::mapLinearOpIterDomains(
       orig_domain_a, MatmulRole::INPUT_A, ndims_out);
diff --git a/csrc/ops/utils.cpp b/csrc/ops/utils.cpp
index 2aa51a28c9d..32d47f7c60f 100644
--- a/csrc/ops/utils.cpp
+++ b/csrc/ops/utils.cpp
@@ -238,19 +238,20 @@ std::vector<IterDomain*> mapLinearOpIterDomains(
         for (auto inx : c10::irange(inp_size - 1)) {
           mapping[inx] = input_domain[inx];
         }
+        mapping[out_size - 1] = input_domain.back();
         break;
       }
       case MatmulRole::INPUT_B: {
-        if (inp_size > 1) {
-          // Weight is of shape {out_features, in_features}
-          mapping[out_size - 1] = input_domain[0];
+        for (auto inx: c10::irange(inp_size - 1)) {
+          // Map N, K to the last two positions of the output.
+          mapping[out_size - 1 - inx] = input_domain[inp_size - 1 - inx];
         }
         break;
       }
       case MatmulRole::INPUT_C: {
         if (inp_size > 0){
           // Bias is 1D tensor of shape {out_features}
-          mapping[out_size - 1] = input_domain[0];
+          mapping[out_size - 2] = input_domain[0];
         }
         break;
       }

From cb6ebffa93704e0303c7d14d26d3d381854ca454 Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Thu, 16 May 2024 04:59:37 +0000
Subject: [PATCH 20/30] add reduction dim

---
 csrc/ops/composite.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/csrc/ops/composite.cpp b/csrc/ops/composite.cpp
index 8d2a7e84d5f..405b18b9af8 100644
--- a/csrc/ops/composite.cpp
+++ b/csrc/ops/composite.cpp
@@ -81,6 +81,8 @@ static TensorView* newForLinear(
   }
 
   std::vector<IterDomain*> out_domain = ops::newOutputDomain({mapping_a, mapping_b, mapping_bias});
+  // Specify the iterdomain for K as reduction
+  out_domain[ndims_out - 1] = IterDomainBuilder(out_domain.back()).iter_type(IterType::Reduction).build(); 
 
   TensorDomain* td = IrBuilder::create<TensorDomain>(
       out_domain, TensorDomain::getContiguityFilledWith(out_domain, true));
@@ -314,6 +316,8 @@ static TensorView* newForMatmul(TensorView* tv_a, TensorView* tv_b) {
     ndims_out = std::max(ndims_a, ndims_b);
   }
 
+  std::vector<IterDomain*> out_domain(ndims_out, nullptr);
+  
   const std::vector<IterDomain*>& mapping_a = ops::mapMatmulOpIterDomains(
       orig_domain_a, MatmulRole::INPUT_A, ndims_out);
   const std::vector<IterDomain*>& mapping_b = ops::mapMatmulOpIterDomains(

From 08b52d01709bb2610380780c2cc3beb64b7d2ce8 Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Thu, 16 May 2024 05:48:25 +0000
Subject: [PATCH 21/30] error generator

---
 tests/python/pytest_input_generators.py | 28 +++++++++++++++++++++++++
 tests/python/pytest_opinfos.py          |  2 ++
 2 files changed, 30 insertions(+)

diff --git a/tests/python/pytest_input_generators.py b/tests/python/pytest_input_generators.py
index cf76e12dfa1..d7cfb152f37 100644
--- a/tests/python/pytest_input_generators.py
+++ b/tests/python/pytest_input_generators.py
@@ -1542,3 +1542,31 @@ def linear_input_generator(
         yield SampleInput(
             make_arg(shape_input), make_arg(shape_weight), make_arg(shape_bias)
         )
+
+def linear_error_generator(op, dtype=torch.float32, requires_grad: bool = False, **kwargs):
+    make_arg = partial(
+        make_tensor, device="cuda", dtype=dtype, requires_grad=requires_grad
+    )
+    # Each case: (input, weight, bias) shapes, exception type, exception string
+    M = 512
+    N = 256
+    K = 32
+
+    bias_with_1dweight = (
+        ((M, K), (K), (N)),
+        RuntimeError,
+        "Expected B to be a 2D matrix if bias is given, got 1D.",
+    )
+
+    # mismatched_bias_extent = (
+    #     ((M, K), (1, K), (N)),
+    #     RuntimeError,
+    #     f"The expanded size of the tensor (1) must match the existing size ({N}) at non-singleton dimension 1.  Target sizes: [{M}, 1].  Tensor sizes: [{N}]",
+    # )
+
+    for input_shapes, ex_type, ex_str in [bias_with_1dweight]:
+        shape_input, shape_weight, shape_bias = input_shapes
+        print (input_shapes)
+        yield SampleInput(
+            make_arg(shape_input), make_arg(shape_weight), make_arg(shape_bias)
+        ), ex_type, ex_str
\ No newline at end of file
diff --git a/tests/python/pytest_opinfos.py b/tests/python/pytest_opinfos.py
index 5d69f57891a..480810d6927 100644
--- a/tests/python/pytest_opinfos.py
+++ b/tests/python/pytest_opinfos.py
@@ -50,6 +50,7 @@
     where_error_generator,
     matmul_input_generator,
     linear_input_generator,
+    linear_error_generator,
 )
 from pytest_utils import (
     bool_int_dtypes,
@@ -1133,6 +1134,7 @@ def torch_reshape_sym_fn(input_tensor, output_shaped_tensor):
         else (torch.float16,)
     ),
     sample_input_generator=linear_input_generator,
+    error_input_generator=linear_error_generator,
     reference=torch.nn.functional.linear,
 )
 linear_ops.append(linear_opinfo)

From 4e6ceaba51b5cf2a4a2676913af3bcf81330b3ca Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Thu, 16 May 2024 19:28:30 +0000
Subject: [PATCH 22/30] rename

---
 csrc/ops/composite.cpp                    | 28 +++++++++++------------
 csrc/ops/composite.h                      |  8 +++----
 csrc/ops/utils.cpp                        | 12 +++++++++-
 tests/cpp/test_matmul_aten_evaluation.cpp |  4 ++--
 4 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/csrc/ops/composite.cpp b/csrc/ops/composite.cpp
index 405b18b9af8..b5ddd5c0ce3 100644
--- a/csrc/ops/composite.cpp
+++ b/csrc/ops/composite.cpp
@@ -92,30 +92,30 @@ static TensorView* newForLinear(
 
 } // namespace
 
-TensorView* linear(TensorView* tv_a, TensorView* tv_b, TensorView* bias) {
-  auto ndims_a = TensorDomain::noReductions(tv_a->getMaybeRFactorDomain()).size();
-  NVF_CHECK(ndims_a > 0, "Input A must be atleast 1D.");
+TensorView* linear(TensorView* input, TensorView* weight, TensorView* bias) {
+  auto input_ndims = TensorDomain::noReductions(input->getMaybeRFactorDomain()).size();
+  NVF_CHECK(input_ndims > 0, "Input A must be atleast 1D.");
   
-  auto ndims_b = TensorDomain::noReductions(tv_b->getMaybeRFactorDomain()).size();
-  NVF_CHECK(ndims_b == 1 || ndims_b == 2, "Input B must be a 1D / 2D tensor.");
+  auto weight_ndims = TensorDomain::noReductions(weight->getMaybeRFactorDomain()).size();
+  NVF_CHECK(weight_ndims == 1 || weight_ndims == 2, "Input B must be a 1D / 2D tensor.");
 
   // Note: This constraint is not documented but F.linear errors out if bias is given with 1D weights.
-  NVF_CHECK(ndims_b == 2 || bias == nullptr, "Expected B to be a 2D matrix if bias is given, got 1D.")
+  NVF_CHECK(weight_ndims == 2 || bias == nullptr, "Expected B to be a 2D matrix if bias is given, got 1D.")
 
   NVF_CHECK(
-      tv_a->dtype() == tv_b->dtype(),
-      "Expected A and B dtypes to have the same dtype, got: ",
-      tv_a->dtype(),
+      input->dtype() == weight->dtype(),
+      "Expected input and weight dtypes to have the same dtype, got: ",
+      input->dtype(),
       " and ",
-      tv_b->dtype());
+      weight->dtype());
   
   NVF_CHECK(
-    bias == nullptr || bias->dtype() == tv_a->dtype(),
-    "Expected bias to have the same dtype as A and B, got: ", bias->dtype(), " and ", tv_b->dtype()
+    bias == nullptr || bias->dtype() == input->dtype(),
+    "Expected bias to have the same dtype as A and B, got: ", bias->dtype(), " and ", input->dtype()
   );
   // For all other cases, create a new LinearOp
-  TensorView* out = newForLinear(tv_a, tv_b, bias);
-  IrBuilder::create<LinearOp>(out, tv_a, tv_b, bias);
+  TensorView* out = newForLinear(input, weight, bias);
+  IrBuilder::create<LinearOp>(out, input, weight, bias);
   return out;
 }
 
diff --git a/csrc/ops/composite.h b/csrc/ops/composite.h
index 13ffe8f8884..3cd38a5d5da 100644
--- a/csrc/ops/composite.h
+++ b/csrc/ops/composite.h
@@ -47,15 +47,15 @@ NVF_API LstmResult lstm(
     TensorView* cell_x,
     TensorView* out_x);
 
-// Linear functions which takes in two tensors of shapes A[* , in_features], B[out_features, in_features] / [in_features] and an optional bias of shape [out_features] or 0D scalar.
-// Bias can only be given if B is a 2-D tensor. 
-TensorView* linear(TensorView* a, TensorView* b, TensorView* bias);
+// Linear function which takes in two tensors of shapes input[*, in_features] and weight[out_features, in_features] / [in_features], plus an optional bias of shape [out_features] or a 0-D scalar.
+// Bias can only be given if weight is a 2-D tensor. 
+TensorView* linear(TensorView* input, TensorView* weight, TensorView* bias);
 // This is an implementation detail to reflect when linear is called
 // without a bias. This calls the above function. We use this function
 // since it simplifies creating a Python API which takes optional arguments.
 // Other options include using lambdas or creating a new RecordFunctor for
 // Linear.
-TensorView* linear(TensorView* a, TensorView* b);
+TensorView* linear(TensorView* input, TensorView* weight);
 
 NVF_API TensorView* sign(TensorView* x);
 NVF_API Val* sign(Val* x);
diff --git a/csrc/ops/utils.cpp b/csrc/ops/utils.cpp
index 32d47f7c60f..532a122e0ff 100644
--- a/csrc/ops/utils.cpp
+++ b/csrc/ops/utils.cpp
@@ -268,7 +268,7 @@ std::vector<IterDomain*> mapLinearOpIterDomains(
 #pragma GCC diagnostic ignored "-Wfree-nonheap-object"
 #endif
 IterDomain* newOutputIterDomain(
-    const std::vector<IterDomain*>& ids,
+    const std::vector<IterDomain*>& input_ids,
     const std::optional<IterType> force_iter_type) {
   // For the start and stop offsets, take the maximum of input axes.
   // For now, the offsets of both start and stop are always integer
@@ -282,6 +282,16 @@ IterDomain* newOutputIterDomain(
   Val* expanded_extent_val = nullptr;
   std::optional<IterType> iter_type = std::nullopt;
 
+  std::vector<IterDomain*> ids;
+  ids.reserve(input_ids.size());
+
+  // Filter out any nullptrs
+  std::copy_if(
+    input_ids.begin(),
+    input_ids.end(),
+    std::back_inserter(ids),
+    [](IterDomain* id) { return id!=nullptr;});
+
   for (auto id : ids) {
     if (id->isBroadcast()) {
       if (id->hasExpandedExtent()) {
diff --git a/tests/cpp/test_matmul_aten_evaluation.cpp b/tests/cpp/test_matmul_aten_evaluation.cpp
index 2105a50dd74..21da8f9efa7 100644
--- a/tests/cpp/test_matmul_aten_evaluation.cpp
+++ b/tests/cpp/test_matmul_aten_evaluation.cpp
@@ -513,7 +513,7 @@ void checkMatmulOpIdMapping(
   }
 }
 
-TEST_P(ATenNodesParametrizedTest, MatmulNodeConcrete) {
+TEST_P(MatmulNodeParametrizedTest, MatmulNodeConcrete) {
   auto fusion = std::make_unique<Fusion>();
   FusionGuard fg(fusion.get());
 
@@ -682,7 +682,7 @@ INSTANTIATE_TEST_SUITE_P(
 // Test case where K=1
 INSTANTIATE_TEST_SUITE_P(
     ReductionAxisIsOne,
-    ATenNodesParametrizedTest,
+    MatmulNodeParametrizedTest,
     testing::Combine(
         testing::Values(
             Sizes({1}),

From 9e9060b71e3afa14edc4a7842628cc2272cb9bcb Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Thu, 16 May 2024 20:42:59 +0000
Subject: [PATCH 23/30] filter nullptr

---
 csrc/ops/composite.cpp | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/csrc/ops/composite.cpp b/csrc/ops/composite.cpp
index b5ddd5c0ce3..16dec3df970 100644
--- a/csrc/ops/composite.cpp
+++ b/csrc/ops/composite.cpp
@@ -81,8 +81,14 @@ static TensorView* newForLinear(
   }
 
   std::vector<IterDomain*> out_domain = ops::newOutputDomain({mapping_a, mapping_b, mapping_bias});
+
+  for (auto idx : c10::irange(ndims_out - 1)){
+    out_domain[idx] = ops::newOutputIterDomain({mapping_a.at(idx), mapping_b.at(idx), mapping_bias.at(idx)});
+  }
   // Specify the iterdomain for K as reduction
-  out_domain[ndims_out - 1] = IterDomainBuilder(out_domain.back()).iter_type(IterType::Reduction).build(); 
+  out_domain[ndims_out - 1] = ops::newOutputIterDomain(
+      {mapping_a.back(), mapping_b.back()},
+      /*force_iter_type=*/IterType::Reduction);
 
   TensorDomain* td = IrBuilder::create<TensorDomain>(
       out_domain, TensorDomain::getContiguityFilledWith(out_domain, true));
@@ -323,16 +329,8 @@ static TensorView* newForMatmul(TensorView* tv_a, TensorView* tv_b) {
   const std::vector<IterDomain*>& mapping_b = ops::mapMatmulOpIterDomains(
       orig_domain_b, MatmulRole::INPUT_B, ndims_out);
 
-  for (auto idx : c10::irange(ndims_out - 1)) {
-    std::vector<IterDomain*> input_ids;
-    input_ids.reserve(2);
-    if (mapping_a[idx] != nullptr) {
-      input_ids.emplace_back(mapping_a[idx]);
-    }
-    if (mapping_b[idx] != nullptr) {
-      input_ids.emplace_back(mapping_b[idx]);
-    }
-    out_domain[idx] = ops::newOutputIterDomain(input_ids);
+  for (auto idx : c10::irange(ndims_out - 1)){
+    out_domain[idx] = ops::newOutputIterDomain({mapping_a.at(idx), mapping_b.at(idx)});
   }
 
   out_domain[ndims_out - 1] = ops::newOutputIterDomain(

From 89e258f5478550ccbc9261e201f9222e492e18c7 Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Thu, 16 May 2024 22:25:03 +0000
Subject: [PATCH 24/30] check id mapping

---
 csrc/ops/composite.cpp                    | 20 ++++----
 csrc/ops/utils.cpp                        |  2 +-
 tests/cpp/test_matmul_aten_evaluation.cpp | 61 +++++++++++++++++++++--
 3 files changed, 68 insertions(+), 15 deletions(-)

diff --git a/csrc/ops/composite.cpp b/csrc/ops/composite.cpp
index 16dec3df970..b7194c3bc10 100644
--- a/csrc/ops/composite.cpp
+++ b/csrc/ops/composite.cpp
@@ -57,23 +57,23 @@ TensorView* dropout_backward(TensorView* dy, TensorView* mask, Val* scale) {
 namespace {
 
 static TensorView* newForLinear(
-    TensorView* tv_a,
-    TensorView* tv_b,
+    TensorView* input,
+    TensorView* weight,
     TensorView* bias) {
-  auto orig_domain_a =
-      TensorDomain::noReductions(tv_a->getMaybeRFactorDomain());
-  auto orig_domain_b =
-      TensorDomain::noReductions(tv_b->getMaybeRFactorDomain());
+  auto input_domain =
+      TensorDomain::noReductions(input->getMaybeRFactorDomain());
+  auto weight_domain =
+      TensorDomain::noReductions(weight->getMaybeRFactorDomain());
 
   // Linear: a = {*, in_features}, b = {out_features, in_features} /
   // {in_features}.The linear output is {*, (out_features), rK}.
   // The first out_size -2 dimensions are as the first input, followed by out_features (if present) and an additional reduction axis K.
-  auto ndims_out = (orig_domain_a.size() - 1) + orig_domain_b.size();
+  auto ndims_out = input_domain.size() + weight_domain.size() - 1;
 
   const std::vector<IterDomain*>& mapping_a = ops::mapLinearOpIterDomains(
-      orig_domain_a, MatmulRole::INPUT_A, ndims_out);
+      input_domain, MatmulRole::INPUT_A, ndims_out);
   const std::vector<IterDomain*>& mapping_b = ops::mapLinearOpIterDomains(
-      orig_domain_b, MatmulRole::INPUT_B, ndims_out);
+      weight_domain, MatmulRole::INPUT_B, ndims_out);
   std::vector<IterDomain*> mapping_bias (ndims_out, nullptr);
   if (bias != nullptr){
     auto bias_domain = TensorDomain::noReductions(bias->getMaybeRFactorDomain());
@@ -93,7 +93,7 @@ static TensorView* newForLinear(
   TensorDomain* td = IrBuilder::create<TensorDomain>(
       out_domain, TensorDomain::getContiguityFilledWith(out_domain, true));
 
-  return IrBuilder::create<TensorView>(td, tv_a->dtype());
+  return IrBuilder::create<TensorView>(td, input->dtype());
 }
 
 } // namespace
diff --git a/csrc/ops/utils.cpp b/csrc/ops/utils.cpp
index 532a122e0ff..fd13e245421 100644
--- a/csrc/ops/utils.cpp
+++ b/csrc/ops/utils.cpp
@@ -242,7 +242,7 @@ std::vector<IterDomain*> mapLinearOpIterDomains(
         break;
       }
       case MatmulRole::INPUT_B: {
-        for (auto inx: c10::irange(inp_size - 1)) {
+        for (auto inx: c10::irange(inp_size)) {
           // Map N, K to the last two positions of the output.
           mapping[out_size - 1 - inx] = input_domain[inp_size - 1 - inx];
         }
diff --git a/tests/cpp/test_matmul_aten_evaluation.cpp b/tests/cpp/test_matmul_aten_evaluation.cpp
index 21da8f9efa7..943bbd76daf 100644
--- a/tests/cpp/test_matmul_aten_evaluation.cpp
+++ b/tests/cpp/test_matmul_aten_evaluation.cpp
@@ -513,6 +513,55 @@ void checkMatmulOpIdMapping(
   }
 }
 
+// Check that ID exact mapping works as expected
+void checkLinearOpIdMapping(
+    Fusion* fusion,
+    TensorView* input,
+    TensorView* weight,
+    TensorView* bias,
+    TensorView* output) {
+  IdModel id_model(fusion);
+  const ValGraph& vg = id_model.idGraph(IdMappingMode::EXACT);
+  vg.validateConsistency();
+
+  const auto checkMapped = [&vg](IterDomain* x, IterDomain* y) -> bool {
+    if (!vg.hasGroup(x) || !vg.hasGroup(y)) {
+      return false;
+    }
+    const ValGroup& gx = vg.toGroup(x);
+    const ValGroup& gy = vg.toGroup(y);
+    return gx.get() == gy.get();
+  };
+
+   // input: [* , in_features]
+   // weight: [out_features, in_features] / [in_features]
+   // bias (optional): [out_features]/[]
+   // output = [*, (out_features), rK]
+
+  ASSERT_EQ(output->nDims(), input->nDims() + weight->nDims() - 1);
+ 
+  // Check that the first input_size - 1 dims are mapped for input
+  for (auto i: c10::irange(input->nDims() - 1)){
+    if (!input->axis(i)->isBroadcast()){
+      EXPECT_TRUE(checkMapped(input->axis(i), output->axis(i)));
+    }
+  }
+  // Check out_features dim is mapped in weight & bias if present.
+  if (weight->nDims() > 1){
+    if (!weight->axis(0)->isBroadcast()){
+      EXPECT_TRUE(checkMapped(weight->axis(0), output->axis(-2)));
+    }
+    if (bias != nullptr && bias->nDims() > 0 && !bias->axis(0)->isBroadcast()) {
+      EXPECT_TRUE(checkMapped(bias->axis(0), output->axis(-2)));
+    }
+  }
+  // Check mapping for reduction axis in input and weight
+  if (!input->axis(-1)->isBroadcast()){
+    EXPECT_TRUE(checkMapped(input->axis(-1), weight->axis(-1)));
+    EXPECT_TRUE(checkMapped(input->axis(-1), output->axis(-1)));
+  }
+}
+
 TEST_P(MatmulNodeParametrizedTest, MatmulNodeConcrete) {
   auto fusion = std::make_unique<Fusion>();
   FusionGuard fg(fusion.get());
@@ -586,11 +635,13 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeConcrete) {
   }
   fusion->addOutput(tv2);
 
+  checkLinearOpIdMapping(fusion.get(), tv0, tv1, bias, tv2);
+
   at::Tensor t0 = at::randn(a_shape, at::kHalf).cuda();
   at::Tensor t1 = at::randn(b_shape, at::kHalf).cuda();
   std::optional<at::Tensor> bias_opt = std::nullopt;
   if (bias_shape.has_value()) {
-    bias_opt = bias_shape.value().empty() ? at::scalar_tensor(3.14).to(at::kHalf).cuda(): at::randn(*bias_shape, at::kHalf).cuda();
+    bias_opt = at::randn(*bias_shape, at::kHalf).cuda();
   }
   at::Tensor out_ref = at::linear(t0, t1, bias_opt);
 
@@ -617,8 +668,8 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeSymbolic) {
 
   const auto& [a_shape, b_shape, bias_shape] = GetParam();
 
-  auto tv0 = makeSymbolicTensor(a_shape.size(), DataType::Half);
-  auto tv1 = makeSymbolicTensor(b_shape.size(), DataType::Half);
+  auto tv0 = makeSymbolicTensor(a_shape, DataType::Half);
+  auto tv1 = makeSymbolicTensor(b_shape, DataType::Half);
 
   TensorView* bias = nullptr;
   if (bias_shape.has_value()) {
@@ -634,11 +685,13 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeSymbolic) {
   }
   fusion->addOutput(tv2);
 
+  checkLinearOpIdMapping(fusion.get(), tv0, tv1, bias, tv2);
+
   at::Tensor t0 = at::randn(a_shape, at::kHalf).cuda();
   at::Tensor t1 = at::randn(b_shape, at::kHalf).cuda();
   std::optional<at::Tensor> bias_opt = std::nullopt;
   if (bias_shape.has_value()) {
-    bias_opt = bias_shape.value().empty() ? at::scalar_tensor(3.14).to(at::kHalf).cuda() : at::randn(*bias_shape, at::kHalf).cuda();
+    bias_opt = at::randn(*bias_shape, at::kHalf).cuda();
   }
   at::Tensor out_ref = at::linear(t0, t1, bias_opt);
 

From 701715340327699a2e0de98c52bf54e59b5bbf97 Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Thu, 16 May 2024 23:10:09 +0000
Subject: [PATCH 25/30] comments, error cases, K=1 cases

---
 csrc/ir/internal_nodes.h                  |  3 +--
 csrc/ops/utils.h                          |  8 ++++++++
 tests/cpp/test_matmul_aten_evaluation.cpp |  2 +-
 tests/python/pytest_input_generators.py   | 14 ++++++++------
 tests/python/pytest_ops.py                |  4 +++-
 5 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/csrc/ir/internal_nodes.h b/csrc/ir/internal_nodes.h
index 3dc644e10af..9bf493c5f7e 100644
--- a/csrc/ir/internal_nodes.h
+++ b/csrc/ir/internal_nodes.h
@@ -2288,8 +2288,7 @@ class MatmulOp : public Expr {
       const std::vector<PolymorphicValue>& inputs) const override;
 };
 
-// Linear Operator to be expression evaluated without decomposition.
-// This node has the same functionality as F.linear (https://pytorch.org/docs/stable/generated/torch.nn.functional.linear.html#torch.nn.functional.linear)
+// Linear node with same functionality as F.linear (https://pytorch.org/docs/stable/generated/torch.nn.functional.linear.html#torch.nn.functional.linear)
 class LinearOp : public Expr {
  public:
   using Expr::Expr;
diff --git a/csrc/ops/utils.h b/csrc/ops/utils.h
index f6750ef9f5e..75d7f413193 100644
--- a/csrc/ops/utils.h
+++ b/csrc/ops/utils.h
@@ -46,6 +46,10 @@ IterType promoteIterType(IterType type1, IterType type2);
 // Mapping B: {nullptr, id_N})
 // 3. A/B are atleast 1D and one of them is > 2D: [B, M, K] x [K, N] -> [B, M,
 // N] (Mapping A: {id_B, id_M, nullptr}, Mapping B: {nullptr, nullptr, id_N})
+// Args: 
+// 1. input_domain: root/rfactor domain without reductions for any input to MatmulOp
+// 2. input_role: Specifies if the input is A / B (MatmulRole::Input_A/Input_B)
+// 3: out_size: MatmulOp output dimension (input and output may not be the same size).
 std::vector<IterDomain*> mapMatmulOpIterDomains(
     const std::vector<IterDomain*>& input_domain,
     MatmulRole input_role,
@@ -54,6 +58,10 @@ std::vector<IterDomain*> mapMatmulOpIterDomains(
 // For LinearOp, the output is the same as the first input (A[*, in_features])for all but the last dimension.
 // If the second input is 2D (B[out_features, in_features]), the last dimension of output is out_features.
 // If bias is 1D (bias[out_features]) it maps to the last dimension of the output.
+// Args: 
+// 1. input_domain: root/rfactor domain without reductions for any input to LinearOp
+// 2. input_role: Specifies if the input is A / B / Bias (MatmulRole::Input_A/Input_B/Input_C)
+// 3: out_size: LinearOp output dimension (input and output may not be the same size).
 std::vector<IterDomain*> mapLinearOpIterDomains(
     const std::vector<IterDomain*>& input_domain,
     MatmulRole input_role,
diff --git a/tests/cpp/test_matmul_aten_evaluation.cpp b/tests/cpp/test_matmul_aten_evaluation.cpp
index 943bbd76daf..b1c9a39c499 100644
--- a/tests/cpp/test_matmul_aten_evaluation.cpp
+++ b/tests/cpp/test_matmul_aten_evaluation.cpp
@@ -769,7 +769,7 @@ INSTANTIATE_TEST_SUITE_P(
     LinearReductionAxisIsOne,
     LinearNodeParametrizedTest,
     testing::Combine(
-        testing::Values(Sizes({m, 1}), Sizes({b, m, 1})),
+        testing::Values(Sizes({1}), Sizes({m, 1}), Sizes({b, m, 1}), Sizes({1, 1}), Sizes({b, 1, 1})),
         testing::Values(Sizes({n, 1})),
         testing::Values(Sizes({}), Sizes({n}))));
 
diff --git a/tests/python/pytest_input_generators.py b/tests/python/pytest_input_generators.py
index d7cfb152f37..c87537dd0ed 100644
--- a/tests/python/pytest_input_generators.py
+++ b/tests/python/pytest_input_generators.py
@@ -1558,13 +1558,15 @@ def linear_error_generator(op, dtype=torch.float32, requires_grad: bool = False,
         "Expected B to be a 2D matrix if bias is given, got 1D.",
     )
 
-    # mismatched_bias_extent = (
-    #     ((M, K), (1, K), (N)),
-    #     RuntimeError,
-    #     f"The expanded size of the tensor (1) must match the existing size ({N}) at non-singleton dimension 1.  Target sizes: [{M}, 1].  Tensor sizes: [{N}]",
-    # )
+    mismatched_bias_extent = (
+        ((M, K), (1, K), (N)),
+        RuntimeError,
+        f"The expanded size of the tensor (1) must match the existing size ({N}) at non-singleton dimension 1.  Target sizes: [{M}, 1].  Tensor sizes: [{N}]",
+    )
+
+    error_cases = [bias_with_1dweight, mismatched_bias_extent]
 
-    for input_shapes, ex_type, ex_str in [bias_with_1dweight]:
+    for input_shapes, ex_type, ex_str in error_cases:
         shape_input, shape_weight, shape_bias = input_shapes
         print (input_shapes)
         yield SampleInput(
diff --git a/tests/python/pytest_ops.py b/tests/python/pytest_ops.py
index ffcb8e7d535..216c687a418 100644
--- a/tests/python/pytest_ops.py
+++ b/tests/python/pytest_ops.py
@@ -209,10 +209,12 @@ def errors_test_fn(
     fd.execute(parse_args_fusion_execution(nvf_op, *sample.args))
 
 
-# A pair of parentheses () represents a capture group in regex.
+# In regex, () delimits a capture group and [] delimits a character class.
 # Escape parenthesis in regex string to match raw characters.
 def _regex_escape_parenthesis(a: str) -> str:
     b = a.replace(r"(", r"\(")
+    b = b.replace(r"[", r"\[")
+    b = b.replace(r"]", r"\]")
     return b.replace(r")", r"\)")
 
 

From 1d501acd7f358024282d02b28db185be8b67b0cc Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Thu, 16 May 2024 23:15:10 +0000
Subject: [PATCH 26/30] remove unused fn

---
 csrc/ops/utils.cpp | 20 --------------------
 csrc/ops/utils.h   |  3 ---
 2 files changed, 23 deletions(-)

diff --git a/csrc/ops/utils.cpp b/csrc/ops/utils.cpp
index fd13e245421..493b8679468 100644
--- a/csrc/ops/utils.cpp
+++ b/csrc/ops/utils.cpp
@@ -361,26 +361,6 @@ IterDomain* newOutputIterDomain(
 #pragma GCC diagnostic pop
 #endif
 
-std::vector<IterDomain*> newOutputDomain(const std::vector<std::vector<IterDomain*>>& input_ids) {
-  NVF_CHECK(
-      !input_ids.empty(),
-      "Tried to create new output Tensorview but received empty list.");
-
-  std::vector<IterDomain*> out_domain(input_ids.front().size(), nullptr);
-
-  for (const auto dim_i : c10::irange(out_domain.size())) {
-    std::vector<IterDomain*> ids_i;
-    ids_i.reserve(input_ids.size());
-    for (auto ids : input_ids) {
-      if (ids[dim_i] != nullptr){
-        ids_i.emplace_back(ids[dim_i]);
-      }
-    }
-    out_domain[dim_i] = newOutputIterDomain(ids_i);
-  }
-  return out_domain;
-}
-
 std::vector<IterDomain*> newOutputDomain(const std::vector<Val*>& vals) {
   std::vector<TensorView*> tvs;
   for (auto val : vals) {
diff --git a/csrc/ops/utils.h b/csrc/ops/utils.h
index 75d7f413193..42e887c7d31 100644
--- a/csrc/ops/utils.h
+++ b/csrc/ops/utils.h
@@ -76,9 +76,6 @@ IterDomain* newOutputIterDomain(
     const std::vector<IterDomain*>& ids,
     const std::optional<IterType> force_iter_type = std::nullopt);
 
-// Takes multiple vectors of input iterdomains and assumes they are aligned to create the output tensorview.
-std::vector<IterDomain*> newOutputDomain(const std::vector<std::vector<IterDomain*>>& input_ids);
-
 // Takes a vector of tensorviews and assumes they are all aligned to create the
 // output tensorview. For eg: BinaryOp.
 std::vector<IterDomain*> newOutputDomain(const std::vector<Val*>& vals);

From 4e5a11e392d997e62ee8fca376cd97b2ad0503f0 Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Fri, 17 May 2024 02:59:25 +0000
Subject: [PATCH 27/30] reuse code

---
 csrc/ops/composite.cpp                    |  2 +-
 csrc/root_domain_map.cpp                  | 36 +++++-------
 tests/cpp/test_matmul_aten_evaluation.cpp | 68 ++++++++++-------------
 3 files changed, 45 insertions(+), 61 deletions(-)

diff --git a/csrc/ops/composite.cpp b/csrc/ops/composite.cpp
index b7194c3bc10..26670166c32 100644
--- a/csrc/ops/composite.cpp
+++ b/csrc/ops/composite.cpp
@@ -80,7 +80,7 @@ static TensorView* newForLinear(
     mapping_bias = ops::mapLinearOpIterDomains(bias_domain, MatmulRole::INPUT_C, ndims_out);
   }
 
-  std::vector<IterDomain*> out_domain = ops::newOutputDomain({mapping_a, mapping_b, mapping_bias});
+  std::vector<IterDomain*> out_domain(ndims_out, nullptr);
 
   for (auto idx : c10::irange(ndims_out - 1)){
     out_domain[idx] = ops::newOutputIterDomain({mapping_a.at(idx), mapping_b.at(idx), mapping_bias.at(idx)});
diff --git a/csrc/root_domain_map.cpp b/csrc/root_domain_map.cpp
index 0f68bf88270..050f2208058 100644
--- a/csrc/root_domain_map.cpp
+++ b/csrc/root_domain_map.cpp
@@ -165,6 +165,18 @@ std::unordered_map<IterDomain*, IterDomain*> PairwiseRootDomainMap::map(
     }
   };
 
+  // Assumes producer and consumer IDs to be trivially aligned and adds them to domain map.
+  auto pairwiseMapAllIds = [&](std::vector<IterDomain*> producer_ids, std::vector<IterDomain*> consumer_ids){
+    for (auto idx : c10::irange(consumer_ids.size())) {
+      IterDomain* producer_id = producer_ids.at(idx);
+      IterDomain* consumer_id = consumer_ids.at(idx);
+      if (producer_id == nullptr) {
+        continue;
+      }
+      updatePairwiseRootDomainMap(producer_id, consumer_id);
+    }
+  };
+  
   // For MatmulOp, use the corresponding mapped input iterdomains.
   if (MatmulOp* op = dynamic_cast<MatmulOp*>(consumer_tv_->definition())) {
     // Check if the producer is lhs/rhs input
@@ -183,18 +195,7 @@ std::unordered_map<IterDomain*, IterDomain*> PairwiseRootDomainMap::map(
     // maps to the third output iterdomain.
     const std::vector<IterDomain*>& aligned_producer_ids =
         ops::mapMatmulOpIterDomains(producer_root, input_role, out_size);
-
-    NVF_ERROR(aligned_producer_ids.size() == consumer_root.size());
-
-    for (auto inx : c10::irange(out_size)) {
-      IterDomain* producer_id = aligned_producer_ids.at(inx);
-      IterDomain* consumer_id = consumer_root.at(inx);
-      if (producer_id == nullptr) {
-        continue;
-      }
-      updatePairwiseRootDomainMap(producer_id, consumer_id);
-    }
-
+    pairwiseMapAllIds(aligned_producer_ids, consumer_root);
     return dom_map;
   }
 
@@ -221,16 +222,7 @@ std::unordered_map<IterDomain*, IterDomain*> PairwiseRootDomainMap::map(
 
     const std::vector<IterDomain*>& aligned_producer_ids =
         ops::mapLinearOpIterDomains(producer_root, input_role.value(), out_size);
-
-    for (auto inx : c10::irange(out_size)) {
-      IterDomain* producer_id = aligned_producer_ids.at(inx);
-      IterDomain* consumer_id = consumer_root.at(inx);
-      if (producer_id == nullptr) {
-        continue;
-      }
-      updatePairwiseRootDomainMap(producer_id, consumer_id);
-    }
-
+    pairwiseMapAllIds(aligned_producer_ids, consumer_root);
     return dom_map;
   }
 
diff --git a/tests/cpp/test_matmul_aten_evaluation.cpp b/tests/cpp/test_matmul_aten_evaluation.cpp
index b1c9a39c499..bb73b4e9c4b 100644
--- a/tests/cpp/test_matmul_aten_evaluation.cpp
+++ b/tests/cpp/test_matmul_aten_evaluation.cpp
@@ -423,6 +423,16 @@ TEST_F(MatmulATenEvaluationTest, LinearWithBias) {
   EXPECT_TRUE(at::allclose(out[0], out_ref));
 }
 
+
+const bool checkMapped (const ValGraph& vg, IterDomain* x, IterDomain* y){
+  if (!vg.hasGroup(x) || !vg.hasGroup(y)) {
+    return false;
+  }
+  const ValGroup& gx = vg.toGroup(x);
+  const ValGroup& gy = vg.toGroup(y);
+  return gx.get() == gy.get();
+};
+
 // Check that ID exact mapping works as expected
 void checkMatmulOpIdMapping(
     Fusion* fusion,
@@ -433,15 +443,6 @@ void checkMatmulOpIdMapping(
   const ValGraph& vg = id_model.idGraph(IdMappingMode::EXACT);
   vg.validateConsistency();
 
-  const auto checkMapped = [&vg](IterDomain* x, IterDomain* y) -> bool {
-    if (!vg.hasGroup(x) || !vg.hasGroup(y)) {
-      return false;
-    }
-    const ValGroup& gx = vg.toGroup(x);
-    const ValGroup& gy = vg.toGroup(y);
-    return gx.get() == gy.get();
-  };
-
   // If K is Broadcast then we will not have a reduction dim
   bool k_bcast = A->axis(-1)->isBroadcast();
   int64_t red_dims = k_bcast ? 0 : 1;
@@ -453,44 +454,44 @@ void checkMatmulOpIdMapping(
     EXPECT_EQ(output->nDims(), 0);
     // When K is Broadcast, we squeeze then multiply then cast instead
     if (!k_bcast) {
-      EXPECT_TRUE(checkMapped(A->axis(0), B->axis(0))); // K
+      EXPECT_TRUE(checkMapped(vg, A->axis(0), B->axis(0))); // K
     }
   } else if (A->nDims() > 1 && B->nDims() == 1) {
     // [..., iM, iK] @ [iK] = [..., iM, rK]
     ASSERT_EQ(output->nDims(), A->nDims() + red_dims - 1);
-    EXPECT_TRUE(checkMapped(A->axis(-2), output->axis(-1 - red_dims))); // M
+    EXPECT_TRUE(checkMapped(vg, A->axis(-2), output->axis(-1 - red_dims))); // M
     if (!k_bcast) {
-      EXPECT_TRUE(checkMapped(A->axis(-1), B->axis(0))); // K
-      EXPECT_TRUE(checkMapped(A->axis(-1), output->axis(-1))); // K
+      EXPECT_TRUE(checkMapped(vg, A->axis(-1), B->axis(0))); // K
+      EXPECT_TRUE(checkMapped(vg, A->axis(-1), output->axis(-1))); // K
     }
     // Check that batch dims are mapped
     for (int64_t i : c10::irange(output->nDims() - red_dims - 1)) {
       if (!A->axis(i)->isBroadcast()) {
-        EXPECT_TRUE(checkMapped(A->axis(i), output->axis(i)));
+        EXPECT_TRUE(checkMapped(vg, A->axis(i), output->axis(i)));
       }
     }
   } else if (A->nDims() == 1 && B->nDims() > 1) {
     // [iK] @ [..., iK, iN] = [..., iN, rK]
     ASSERT_EQ(output->nDims(), B->nDims() + red_dims - 1);
-    EXPECT_TRUE(checkMapped(B->axis(-1), output->axis(-1 - red_dims))); // N
+    EXPECT_TRUE(checkMapped(vg, B->axis(-1), output->axis(-1 - red_dims))); // N
     if (!k_bcast) {
-      EXPECT_TRUE(checkMapped(A->axis(0), B->axis(-2))); // K
-      EXPECT_TRUE(checkMapped(A->axis(0), output->axis(-1))); // K
+      EXPECT_TRUE(checkMapped(vg, A->axis(0), B->axis(-2))); // K
+      EXPECT_TRUE(checkMapped(vg, A->axis(0), output->axis(-1))); // K
     }
     // Check that batch dims are mapped
     for (int64_t i : c10::irange(output->nDims() - red_dims - 1)) {
       if (!B->axis(i)->isBroadcast()) {
-        EXPECT_TRUE(checkMapped(B->axis(i), output->axis(i)));
+        EXPECT_TRUE(checkMapped(vg, B->axis(i), output->axis(i)));
       }
     }
   } else if (A->nDims() > 1 && B->nDims() > 1) {
     // [..., iM, iK] @ [..., iK, iN] = [..., iM, iN, rK]
     ASSERT_EQ(output->nDims(), std::max(A->nDims(), B->nDims()) + red_dims);
-    EXPECT_TRUE(checkMapped(A->axis(-2), output->axis(-2 - red_dims))); // M
-    EXPECT_TRUE(checkMapped(B->axis(-1), output->axis(-1 - red_dims))); // N
+    EXPECT_TRUE(checkMapped(vg, A->axis(-2), output->axis(-2 - red_dims))); // M
+    EXPECT_TRUE(checkMapped(vg, B->axis(-1), output->axis(-1 - red_dims))); // N
     if (!k_bcast) {
-      EXPECT_TRUE(checkMapped(A->axis(-1), B->axis(-2))); // K
-      EXPECT_TRUE(checkMapped(A->axis(-1), output->axis(-1))); // K
+      EXPECT_TRUE(checkMapped(vg, A->axis(-1), B->axis(-2))); // K
+      EXPECT_TRUE(checkMapped(vg, A->axis(-1), output->axis(-1))); // K
     }
     // Check that batch dims are mapped
     // Note that A and B can have different dimensions, so here we count
@@ -501,10 +502,10 @@ void checkMatmulOpIdMapping(
       int64_t i_b = B->nDims() - 3 - i;
       int64_t i_out = output->nDims() - red_dims - 3 - i;
       if (i_a >= 0 && !A->axis(i_a)->isBroadcast()) {
-        EXPECT_TRUE(checkMapped(A->axis(i_a), output->axis(i_out)));
+        EXPECT_TRUE(checkMapped(vg, A->axis(i_a), output->axis(i_out)));
       }
       if (i_b >= 0 && !B->axis(i_b)->isBroadcast()) {
-        EXPECT_TRUE(checkMapped(B->axis(i_b), output->axis(i_out)));
+        EXPECT_TRUE(checkMapped(vg, B->axis(i_b), output->axis(i_out)));
       }
     }
   } else {
@@ -524,15 +525,6 @@ void checkLinearOpIdMapping(
   const ValGraph& vg = id_model.idGraph(IdMappingMode::EXACT);
   vg.validateConsistency();
 
-  const auto checkMapped = [&vg](IterDomain* x, IterDomain* y) -> bool {
-    if (!vg.hasGroup(x) || !vg.hasGroup(y)) {
-      return false;
-    }
-    const ValGroup& gx = vg.toGroup(x);
-    const ValGroup& gy = vg.toGroup(y);
-    return gx.get() == gy.get();
-  };
-
    // input: [* , in_features]
    // weight: [out_features, in_features] / [out_features]
    // bias (optional): [out_features]/[]
@@ -543,22 +535,22 @@ void checkLinearOpIdMapping(
   // Check that the first input_size - 1 dims are mapped for input
   for (auto i: c10::irange(input->nDims() - 1)){
     if (!input->axis(i)->isBroadcast()){
-      EXPECT_TRUE(checkMapped(input->axis(i), output->axis(i)));
+      EXPECT_TRUE(checkMapped(vg, input->axis(i), output->axis(i)));
     }
   }
   // Check out_features dim is mapped in weight & bias if present.
   if (weight->nDims() > 1){
     if (!weight->axis(0)->isBroadcast()){
-      EXPECT_TRUE(checkMapped(weight->axis(0), output->axis(-2)));
+      EXPECT_TRUE(checkMapped(vg, weight->axis(0), output->axis(-2)));
     }
     if (bias != nullptr && bias->nDims() > 0 && !bias->axis(0)->isBroadcast()) {
-      EXPECT_TRUE(checkMapped(bias->axis(0), output->axis(-2)));
+      EXPECT_TRUE(checkMapped(vg, bias->axis(0), output->axis(-2)));
     }
   }
   // Check mapping for reduction axis in input and weight
   if (!input->axis(-1)->isBroadcast()){
-    EXPECT_TRUE(checkMapped(input->axis(-1), weight->axis(-1)));
-    EXPECT_TRUE(checkMapped(input->axis(-1), output->axis(-1)));
+    EXPECT_TRUE(checkMapped(vg, input->axis(-1), weight->axis(-1)));
+    EXPECT_TRUE(checkMapped(vg, input->axis(-1), output->axis(-1)));
   }
 }
 

From c41dd513d029a99c67f98a10acd0289973890d8f Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Fri, 17 May 2024 03:23:31 +0000
Subject: [PATCH 28/30] lint

---
 csrc/ir/internal_nodes.h                  |  3 +-
 csrc/ops/composite.cpp                    | 58 ++++++++++++++---------
 csrc/ops/composite.h                      |  6 ++-
 csrc/ops/utils.cpp                        | 51 ++++++++++----------
 csrc/ops/utils.h                          | 25 ++++++----
 csrc/root_domain_map.cpp                  | 13 +++--
 tests/cpp/test_matmul_aten_evaluation.cpp | 44 +++++++++++------
 tests/python/pytest_input_generators.py   |  8 ++--
 8 files changed, 124 insertions(+), 84 deletions(-)

diff --git a/csrc/ir/internal_nodes.h b/csrc/ir/internal_nodes.h
index 9bf493c5f7e..0ddd0a704a1 100644
--- a/csrc/ir/internal_nodes.h
+++ b/csrc/ir/internal_nodes.h
@@ -2288,7 +2288,8 @@ class MatmulOp : public Expr {
       const std::vector<PolymorphicValue>& inputs) const override;
 };
 
-// Linear node with same functionality as F.linear (https://pytorch.org/docs/stable/generated/torch.nn.functional.linear.html#torch.nn.functional.linear)
+// Linear node with same functionality as F.linear
+// (https://pytorch.org/docs/stable/generated/torch.nn.functional.linear.html#torch.nn.functional.linear)
 class LinearOp : public Expr {
  public:
   using Expr::Expr;
diff --git a/csrc/ops/composite.cpp b/csrc/ops/composite.cpp
index 26670166c32..4685e0cf5ab 100644
--- a/csrc/ops/composite.cpp
+++ b/csrc/ops/composite.cpp
@@ -67,23 +67,27 @@ static TensorView* newForLinear(
 
   // Linear: a = {*, in_features}, b = {out_features, in_features} /
   // {in_features}.The linear output is {*, (out_features), rK}.
-  // The first out_size -2 dimensions are as the first input, followed by out_features (if present) and an additional reduction axis K.
+  // The first out_size - 2 dimensions match the first input, followed by
+  // out_features (if present) and an additional reduction axis K.
   auto ndims_out = input_domain.size() + weight_domain.size() - 1;
 
-  const std::vector<IterDomain*>& mapping_a = ops::mapLinearOpIterDomains(
-      input_domain, MatmulRole::INPUT_A, ndims_out);
+  const std::vector<IterDomain*>& mapping_a =
+      ops::mapLinearOpIterDomains(input_domain, MatmulRole::INPUT_A, ndims_out);
   const std::vector<IterDomain*>& mapping_b = ops::mapLinearOpIterDomains(
       weight_domain, MatmulRole::INPUT_B, ndims_out);
-  std::vector<IterDomain*> mapping_bias (ndims_out, nullptr);
-  if (bias != nullptr){
-    auto bias_domain = TensorDomain::noReductions(bias->getMaybeRFactorDomain());
-    mapping_bias = ops::mapLinearOpIterDomains(bias_domain, MatmulRole::INPUT_C, ndims_out);
+  std::vector<IterDomain*> mapping_bias(ndims_out, nullptr);
+  if (bias != nullptr) {
+    auto bias_domain =
+        TensorDomain::noReductions(bias->getMaybeRFactorDomain());
+    mapping_bias = ops::mapLinearOpIterDomains(
+        bias_domain, MatmulRole::INPUT_C, ndims_out);
   }
 
   std::vector<IterDomain*> out_domain(ndims_out, nullptr);
 
-  for (auto idx : c10::irange(ndims_out - 1)){
-    out_domain[idx] = ops::newOutputIterDomain({mapping_a.at(idx), mapping_b.at(idx), mapping_bias.at(idx)});
+  for (auto idx : c10::irange(ndims_out - 1)) {
+    out_domain[idx] = ops::newOutputIterDomain(
+        {mapping_a.at(idx), mapping_b.at(idx), mapping_bias.at(idx)});
   }
   // Specify the iterdomain for K as reduction
   out_domain[ndims_out - 1] = ops::newOutputIterDomain(
@@ -99,14 +103,21 @@ static TensorView* newForLinear(
 } // namespace
 
 TensorView* linear(TensorView* input, TensorView* weight, TensorView* bias) {
-  auto input_ndims = TensorDomain::noReductions(input->getMaybeRFactorDomain()).size();
+  auto input_ndims =
+      TensorDomain::noReductions(input->getMaybeRFactorDomain()).size();
   NVF_CHECK(input_ndims > 0, "Input A must be atleast 1D.");
-  
-  auto weight_ndims = TensorDomain::noReductions(weight->getMaybeRFactorDomain()).size();
-  NVF_CHECK(weight_ndims == 1 || weight_ndims == 2, "Input B must be a 1D / 2D tensor.");
 
-  // Note: This constraint is not documented but F.linear errors out if bias is given with 1D weights.
-  NVF_CHECK(weight_ndims == 2 || bias == nullptr, "Expected B to be a 2D matrix if bias is given, got 1D.")
+  auto weight_ndims =
+      TensorDomain::noReductions(weight->getMaybeRFactorDomain()).size();
+  NVF_CHECK(
+      weight_ndims == 1 || weight_ndims == 2,
+      "Input B must be a 1D / 2D tensor.");
+
+  // Note: This constraint is not documented but F.linear errors out if bias is
+  // given with 1D weights.
+  NVF_CHECK(
+      weight_ndims == 2 || bias == nullptr,
+      "Expected B to be a 2D matrix if bias is given, got 1D.")
 
   NVF_CHECK(
       input->dtype() == weight->dtype(),
@@ -114,11 +125,13 @@ TensorView* linear(TensorView* input, TensorView* weight, TensorView* bias) {
       input->dtype(),
       " and ",
       weight->dtype());
-  
+
   NVF_CHECK(
-    bias == nullptr || bias->dtype() == input->dtype(),
-    "Expected bias to have the same dtype as A and B, got: ", bias->dtype(), " and ", input->dtype()
-  );
+      bias == nullptr || bias->dtype() == input->dtype(),
+      "Expected bias to have the same dtype as A and B, got: ",
+      bias->dtype(),
+      " and ",
+      input->dtype());
   // For all other cases, create a new LinearOp
   TensorView* out = newForLinear(input, weight, bias);
   IrBuilder::create<LinearOp>(out, input, weight, bias);
@@ -323,14 +336,15 @@ static TensorView* newForMatmul(TensorView* tv_a, TensorView* tv_b) {
   }
 
   std::vector<IterDomain*> out_domain(ndims_out, nullptr);
-  
+
   const std::vector<IterDomain*>& mapping_a = ops::mapMatmulOpIterDomains(
       orig_domain_a, MatmulRole::INPUT_A, ndims_out);
   const std::vector<IterDomain*>& mapping_b = ops::mapMatmulOpIterDomains(
       orig_domain_b, MatmulRole::INPUT_B, ndims_out);
 
-  for (auto idx : c10::irange(ndims_out - 1)){
-    out_domain[idx] = ops::newOutputIterDomain({mapping_a.at(idx), mapping_b.at(idx)});
+  for (auto idx : c10::irange(ndims_out - 1)) {
+    out_domain[idx] =
+        ops::newOutputIterDomain({mapping_a.at(idx), mapping_b.at(idx)});
   }
 
   out_domain[ndims_out - 1] = ops::newOutputIterDomain(
diff --git a/csrc/ops/composite.h b/csrc/ops/composite.h
index 3cd38a5d5da..0ef555ebc59 100644
--- a/csrc/ops/composite.h
+++ b/csrc/ops/composite.h
@@ -47,8 +47,10 @@ NVF_API LstmResult lstm(
     TensorView* cell_x,
     TensorView* out_x);
 
-// Linear functions which takes in two tensors of shapes input[* , in_features], weight[out_features, in_features] / [in_features] and an optional bias of shape [out_features] or 0D scalar.
-// Bias can only be given if weight is a 2-D tensor. 
+// Linear function which takes in two tensors of shapes input[* , in_features],
+// weight[out_features, in_features] / [in_features] and an optional bias of
+// shape [out_features] or 0D scalar. Bias can only be given if weight is a 2-D
+// tensor.
 TensorView* linear(TensorView* input, TensorView* weight, TensorView* bias);
 // This is an implementation detail to reflect when linear is called
 // without a bias. This calls the above function. We use this function
diff --git a/csrc/ops/utils.cpp b/csrc/ops/utils.cpp
index 493b8679468..43570a50ede 100644
--- a/csrc/ops/utils.cpp
+++ b/csrc/ops/utils.cpp
@@ -225,7 +225,6 @@ std::vector<IterDomain*> mapLinearOpIterDomains(
     const std::vector<IterDomain*>& input_domain,
     MatmulRole input_role,
     size_t out_size) {
-
   std::vector<IterDomain*> mapping(out_size, nullptr);
   auto inp_size = input_domain.size();
 
@@ -233,30 +232,30 @@ std::vector<IterDomain*> mapLinearOpIterDomains(
   // Input B: {*, N, K} / {K}
   // Bias: {N} / {}
   switch (input_role) {
-      case MatmulRole::INPUT_A: {
-        // Linear output is same as input for all but the last dimension
-        for (auto inx : c10::irange(inp_size - 1)) {
-          mapping[inx] = input_domain[inx];
-        }
-        mapping[out_size - 1] = input_domain.back();
-        break;
+    case MatmulRole::INPUT_A: {
+      // Linear output is same as input for all but the last dimension
+      for (auto inx : c10::irange(inp_size - 1)) {
+        mapping[inx] = input_domain[inx];
       }
-      case MatmulRole::INPUT_B: {
-        for (auto inx: c10::irange(inp_size)) {
-          // Map N, K to the last two positions of the output.
-          mapping[out_size - 1 - inx] = input_domain[inp_size - 1 - inx];
-        }
-        break;
+      mapping[out_size - 1] = input_domain.back();
+      break;
+    }
+    case MatmulRole::INPUT_B: {
+      for (auto inx : c10::irange(inp_size)) {
+        // Map N, K to the last two positions of the output.
+        mapping[out_size - 1 - inx] = input_domain[inp_size - 1 - inx];
       }
-      case MatmulRole::INPUT_C: {
-        if (inp_size > 0){
-          // Bias is 1D tensor of shape {out_features}
-          mapping[out_size - 2] = input_domain[0];
-        }
-        break;
+      break;
+    }
+    case MatmulRole::INPUT_C: {
+      if (inp_size > 0) {
+        // Bias is 1D tensor of shape {out_features}
+        mapping[out_size - 2] = input_domain[0];
       }
-      default:
-        NVF_ERROR("Unexpected input type.");
+      break;
+    }
+    default:
+      NVF_ERROR("Unexpected input type.");
   }
   return mapping;
 }
@@ -287,10 +286,10 @@ IterDomain* newOutputIterDomain(
 
   // Filter out any nullptrs
   std::copy_if(
-    input_ids.begin(),
-    input_ids.end(),
-    std::back_inserter(ids),
-    [](IterDomain* id) { return id!=nullptr;});
+      input_ids.begin(),
+      input_ids.end(),
+      std::back_inserter(ids),
+      [](IterDomain* id) { return id != nullptr; });
 
   for (auto id : ids) {
     if (id->isBroadcast()) {
diff --git a/csrc/ops/utils.h b/csrc/ops/utils.h
index 42e887c7d31..5c0982bb39e 100644
--- a/csrc/ops/utils.h
+++ b/csrc/ops/utils.h
@@ -46,22 +46,27 @@ IterType promoteIterType(IterType type1, IterType type2);
 // Mapping B: {nullptr, id_N})
 // 3. A/B are atleast 1D and one of them is > 2D: [B, M, K] x [K, N] -> [B, M,
 // N] (Mapping A: {id_B, id_M, nullptr}, Mapping B: {nullptr, nullptr, id_N})
-// Args: 
-// 1. input_domain: root/rfactor domain without reductions for any input to MatmulOp
+// Args:
+// 1. input_domain: root/rfactor domain without reductions for any input to
+// MatmulOp
 // 2. input_role: Specifies if the input is A / B (MatmulRole::Input_A/Input_B)
-// 3: out_size: MatmulOp output dimension (input and output may not be the same size).
+// 3. out_size: MatmulOp output dimension (input and output may not be the same
+// size).
 std::vector<IterDomain*> mapMatmulOpIterDomains(
     const std::vector<IterDomain*>& input_domain,
     MatmulRole input_role,
     size_t out_size);
 
-// For LinearOp, the output is the same as the first input (A[*, in_features])for all but the last dimension.
-// If the second input is 2D (B[out_features, in_features]), the last dimension of output is out_features.
-// If bias is 1D (bias[out_features]) it maps to the last dimension of the output.
-// Args: 
-// 1. input_domain: root/rfactor domain without reductions for any input to LinearOp
-// 2. input_role: Specifies if the input is A / B / Bias (MatmulRole::Input_A/Input_B/Input_C)
-// 3: out_size: LinearOp output dimension (input and output may not be the same size).
+// For LinearOp, the output is the same as the first input (A[*, in_features])
+// for all but the last dimension. If the second input is 2D (B[out_features,
+// in_features]), the last dimension of output is out_features. If bias is 1D
+// (bias[out_features]) it maps to the last dimension of the output.
+// Args:
+// 1. input_domain: root/rfactor domain (no reductions) of a LinearOp input
+// 2. input_role: Specifies if the input is A / B / Bias
+// (MatmulRole::Input_A/Input_B/Input_C)
+// 3. out_size: LinearOp output dimension (input and output may not be the
+// same size).
 std::vector<IterDomain*> mapLinearOpIterDomains(
     const std::vector<IterDomain*>& input_domain,
     MatmulRole input_role,
diff --git a/csrc/root_domain_map.cpp b/csrc/root_domain_map.cpp
index 050f2208058..d8dcea09ae8 100644
--- a/csrc/root_domain_map.cpp
+++ b/csrc/root_domain_map.cpp
@@ -165,8 +165,10 @@ std::unordered_map<IterDomain*, IterDomain*> PairwiseRootDomainMap::map(
     }
   };
 
-  // Assumes producer and consumer IDs to be trivially aligned and adds them to domain map.
-  auto pairwiseMapAllIds = [&](std::vector<IterDomain*> producer_ids, std::vector<IterDomain*> consumer_ids){
+  // Assumes producer and consumer IDs to be trivially aligned and adds them to
+  // domain map.
+  auto pairwiseMapAllIds = [&](std::vector<IterDomain*> producer_ids,
+                               std::vector<IterDomain*> consumer_ids) {
     for (auto idx : c10::irange(consumer_ids.size())) {
       IterDomain* producer_id = producer_ids.at(idx);
       IterDomain* consumer_id = consumer_ids.at(idx);
@@ -176,7 +178,7 @@ std::unordered_map<IterDomain*, IterDomain*> PairwiseRootDomainMap::map(
       updatePairwiseRootDomainMap(producer_id, consumer_id);
     }
   };
-  
+
   // For MatmulOp, use the corresponding mapped input iterdomains.
   if (MatmulOp* op = dynamic_cast<MatmulOp*>(consumer_tv_->definition())) {
     // Check if the producer is lhs/rhs input
@@ -208,7 +210,7 @@ std::unordered_map<IterDomain*, IterDomain*> PairwiseRootDomainMap::map(
       input_role = MatmulRole::INPUT_A;
     } else if (producer->sameAs(op->inB()->as<TensorView>()->domain())) {
       input_role = MatmulRole::INPUT_B;
-    } else if (producer->sameAs(op->bias()->as<TensorView>()->domain())){
+    } else if (producer->sameAs(op->bias()->as<TensorView>()->domain())) {
       input_role = MatmulRole::INPUT_C;
     } else {
       NVF_ERROR(false, "Producer did not match any LinearOp input.")
@@ -221,7 +223,8 @@ std::unordered_map<IterDomain*, IterDomain*> PairwiseRootDomainMap::map(
     // output = {*, out_features} / {*}
 
     const std::vector<IterDomain*>& aligned_producer_ids =
-        ops::mapLinearOpIterDomains(producer_root, input_role.value(), out_size);
+        ops::mapLinearOpIterDomains(
+            producer_root, input_role.value(), out_size);
     pairwiseMapAllIds(aligned_producer_ids, consumer_root);
     return dom_map;
   }
diff --git a/tests/cpp/test_matmul_aten_evaluation.cpp b/tests/cpp/test_matmul_aten_evaluation.cpp
index bb73b4e9c4b..80287793480 100644
--- a/tests/cpp/test_matmul_aten_evaluation.cpp
+++ b/tests/cpp/test_matmul_aten_evaluation.cpp
@@ -423,8 +423,7 @@ TEST_F(MatmulATenEvaluationTest, LinearWithBias) {
   EXPECT_TRUE(at::allclose(out[0], out_ref));
 }
 
-
-const bool checkMapped (const ValGraph& vg, IterDomain* x, IterDomain* y){
+const bool checkMapped(const ValGraph& vg, IterDomain* x, IterDomain* y) {
   if (!vg.hasGroup(x) || !vg.hasGroup(y)) {
     return false;
   }
@@ -525,22 +524,22 @@ void checkLinearOpIdMapping(
   const ValGraph& vg = id_model.idGraph(IdMappingMode::EXACT);
   vg.validateConsistency();
 
-   // input: [* , in_features]
-   // weight: [out_features, in_features] / [out_features]
-   // bias (optional): [out_features]/[]
-   // output = [*, (out_features), rK]
+  // input: [* , in_features]
+  // weight: [out_features, in_features] / [out_features]
+  // bias (optional): [out_features]/[]
+  // output = [*, (out_features), rK]
 
   ASSERT_EQ(output->nDims(), input->nDims() + weight->nDims() - 1);
- 
+
   // Check that the first input_size - 1 dims are mapped for input
-  for (auto i: c10::irange(input->nDims() - 1)){
-    if (!input->axis(i)->isBroadcast()){
+  for (auto i : c10::irange(input->nDims() - 1)) {
+    if (!input->axis(i)->isBroadcast()) {
       EXPECT_TRUE(checkMapped(vg, input->axis(i), output->axis(i)));
     }
   }
   // Check out_features dim is mapped in weight & bias if present.
-  if (weight->nDims() > 1){
-    if (!weight->axis(0)->isBroadcast()){
+  if (weight->nDims() > 1) {
+    if (!weight->axis(0)->isBroadcast()) {
       EXPECT_TRUE(checkMapped(vg, weight->axis(0), output->axis(-2)));
     }
     if (bias != nullptr && bias->nDims() > 0 && !bias->axis(0)->isBroadcast()) {
@@ -548,7 +547,7 @@ void checkLinearOpIdMapping(
     }
   }
   // Check mapping for reduction axis in input and weight
-  if (!input->axis(-1)->isBroadcast()){
+  if (!input->axis(-1)->isBroadcast()) {
     EXPECT_TRUE(checkMapped(vg, input->axis(-1), weight->axis(-1)));
     EXPECT_TRUE(checkMapped(vg, input->axis(-1), output->axis(-1)));
   }
@@ -745,7 +744,12 @@ INSTANTIATE_TEST_SUITE_P(
     LinearWithoutBias,
     LinearNodeParametrizedTest,
     testing::Combine(
-        testing::Values(Sizes({k}), Sizes({m, k}), Sizes({b, m, k}), Sizes({1, k}), Sizes({b, 1, k})),
+        testing::Values(
+            Sizes({k}),
+            Sizes({m, k}),
+            Sizes({b, m, k}),
+            Sizes({1, k}),
+            Sizes({b, 1, k})),
         testing::Values(Sizes({k}), Sizes({n, k}), Sizes({1, k})),
         testing::Values(std::nullopt)));
 
@@ -753,7 +757,12 @@ INSTANTIATE_TEST_SUITE_P(
     LinearWithBias,
     LinearNodeParametrizedTest,
     testing::Combine(
-        testing::Values(Sizes({k}), Sizes({m, k}), Sizes({b, m, k}), Sizes({1, k}), Sizes({b, 1, k})),
+        testing::Values(
+            Sizes({k}),
+            Sizes({m, k}),
+            Sizes({b, m, k}),
+            Sizes({1, k}),
+            Sizes({b, 1, k})),
         testing::Values(Sizes({n, k})),
         testing::Values(Sizes({}), Sizes({n}))));
 
@@ -761,7 +770,12 @@ INSTANTIATE_TEST_SUITE_P(
     LinearReductionAxisIsOne,
     LinearNodeParametrizedTest,
     testing::Combine(
-        testing::Values(Sizes({1}), Sizes({m, 1}), Sizes({b, m, 1}), Sizes({1, 1}), Sizes({b, 1, 1})),
+        testing::Values(
+            Sizes({1}),
+            Sizes({m, 1}),
+            Sizes({b, m, 1}),
+            Sizes({1, 1}),
+            Sizes({b, 1, 1})),
         testing::Values(Sizes({n, 1})),
         testing::Values(Sizes({}), Sizes({n}))));
 
diff --git a/tests/python/pytest_input_generators.py b/tests/python/pytest_input_generators.py
index c87537dd0ed..137dab7c229 100644
--- a/tests/python/pytest_input_generators.py
+++ b/tests/python/pytest_input_generators.py
@@ -1543,7 +1543,10 @@ def linear_input_generator(
             make_arg(shape_input), make_arg(shape_weight), make_arg(shape_bias)
         )
 
-def linear_error_generator(op, dtype=torch.float32, requires_grad: bool = False, **kwargs):
+
+def linear_error_generator(
+    op, dtype=torch.float32, requires_grad: bool = False, **kwargs
+):
     make_arg = partial(
         make_tensor, device="cuda", dtype=dtype, requires_grad=requires_grad
     )
@@ -1568,7 +1571,6 @@ def linear_error_generator(op, dtype=torch.float32, requires_grad: bool = False,
 
     for input_shapes, ex_type, ex_str in error_cases:
         shape_input, shape_weight, shape_bias = input_shapes
-        print (input_shapes)
         yield SampleInput(
             make_arg(shape_input), make_arg(shape_weight), make_arg(shape_bias)
-        ), ex_type, ex_str
\ No newline at end of file
+        ), ex_type, ex_str

From 0acc863ade12f4cf3c5d07cbc4b6ec210e0257a6 Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Fri, 17 May 2024 03:29:02 +0000
Subject: [PATCH 29/30] add check

---
 csrc/root_domain_map.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/csrc/root_domain_map.cpp b/csrc/root_domain_map.cpp
index d8dcea09ae8..01b2adbd4c6 100644
--- a/csrc/root_domain_map.cpp
+++ b/csrc/root_domain_map.cpp
@@ -169,6 +169,7 @@ std::unordered_map<IterDomain*, IterDomain*> PairwiseRootDomainMap::map(
   // domain map.
   auto pairwiseMapAllIds = [&](std::vector<IterDomain*> producer_ids,
                                std::vector<IterDomain*> consumer_ids) {
+    NVF_ERROR(producer_ids.size() == consumer_ids.size());
     for (auto idx : c10::irange(consumer_ids.size())) {
       IterDomain* producer_id = producer_ids.at(idx);
       IterDomain* consumer_id = consumer_ids.at(idx);

From 8c2afd158615f6396511ab7964e5393c107f12ec Mon Sep 17 00:00:00 2001
From: root <26priya11@gmail.com>
Date: Fri, 17 May 2024 03:34:42 +0000
Subject: [PATCH 30/30] chain replace

---
 tests/python/pytest_ops.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tests/python/pytest_ops.py b/tests/python/pytest_ops.py
index 216c687a418..690e0294eca 100644
--- a/tests/python/pytest_ops.py
+++ b/tests/python/pytest_ops.py
@@ -212,10 +212,8 @@ def errors_test_fn(
 # A pair of parentheses ()/[] represents a capture group in regex.
 # Escape parenthesis in regex string to match raw characters.
 def _regex_escape_parenthesis(a: str) -> str:
-    b = a.replace(r"(", r"\(")
-    b = b.replace(r"[", r"\[")
-    b = b.replace(r"]", r"\]")
-    return b.replace(r")", r"\)")
+    b = a.replace(r"[", r"\[").replace(r"]", r"\]")
+    return b.replace(r"(", r"\(").replace(r")", r"\)")
 
 
 @create_op_test(tuple(op for op in opinfos if op.error_input_generator is not None))